grim 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Development dependencies for grim.
#
# Use the explicit HTTPS URL: the :rubygems symbol shortcut is deprecated by
# Bundler because it resolves to an insecure plain-HTTP source.
source "https://rubygems.org"

# Pull in whatever grim.gemspec declares.
gemspec

gem 'rspec'
gem 'ruby-debug'
data/README.textile ADDED
@@ -0,0 +1,17 @@
1
+ h1. Grim
2
+
3
+ Grim is a simple gem for extracting a page from a pdf and converting it to an image as well as extracting the text from the page as a string. It basically gives you an easy to use API to ghostscript, imagemagick, and pdftotext specific to this use case.
4
+
5
+ h2. Prerequisites
6
+
7
+ You will need ghostscript, imagemagick, and xpdf installed. On the Mac (OS X) I highly recommend using "Homebrew":http://mxcl.github.com/homebrew/ to get them installed; it's as simple as "brew install ghostscript", "brew install imagemagick", and "brew install xpdf".
8
+
9
+ h2. Usage
10
+
11
+ <pre><code>
12
+ instance = Grim.new("/path/to/pdf")
13
+ page_count = instance.page_count # returns the number of pages in the pdf
14
+ png = instance.page(1).to_image("/path/to/save/image.png") # saves png to path and returns File instance
15
+ jpeg = instance.page(2).to_image("/path/to/save/image.jpeg") # saves jpeg to path and returns File instance
16
+ text = instance.page(3).text # returns text as a string
17
+ </code></pre>
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
data/grim.gemspec ADDED
@@ -0,0 +1,20 @@
1
# -*- encoding: utf-8 -*-
# Gem specification for grim. lib/ is appended to the load path so that
# requiring "grim" makes Grim::VERSION available below.
lib_dir = File.expand_path("../lib", __FILE__)
$LOAD_PATH.push(lib_dir)
require "grim"

Gem::Specification.new do |spec|
  spec.name        = "grim"
  spec.version     = Grim::VERSION
  spec.authors     = ["Jonathan Hoyt"]
  spec.email       = ["jonmagic@gmail.com"]
  spec.homepage    = ""
  spec.summary     = "Extract slides and text from a PDF."
  spec.description = "Grim is a simple gem for extracting a page from a pdf and converting it to an image as well as extract the text from the page as a string. It basically gives you an easy to use api to ghostscript, imagemagick, and pdftotext specific to this use case."

  spec.rubyforge_project = "grim"

  # File lists come from git, so only tracked files end up in the gem.
  tracked_files = `git ls-files`.split("\n")
  tracked_tests = `git ls-files -- {test,spec,features}/*`.split("\n")
  tracked_bins  = `git ls-files -- bin/*`.split("\n")

  spec.files         = tracked_files
  spec.test_files    = tracked_tests
  spec.executables   = tracked_bins.map { |bin_path| File.basename(bin_path) }
  spec.require_paths = ["lib"]
end
data/lib/grim.rb ADDED
@@ -0,0 +1,104 @@
1
# Grim is a class with instance methods for getting the number of pages in a
# pdf, extracting a page as an image, and extracting the text from a page. It
# does this by running the ghostscript (gs), imagemagick (convert) and
# pdftotext command line tools.
#
# For example:
#
#   instance   = Grim.new("/path/to/pdf")
#   page_count = instance.page_count
#   png        = instance.page(1).to_image("/path/to/save/image.png")
#   jpeg       = instance.page(2).to_image("/path/to/save/image.jpeg")
#   text       = instance.page(3).text
#
class Grim
  # VERSION
  VERSION = "0.1.0"

  # Default resize output width, any positive integer
  WIDTH = 1024

  # Default image quality, 1 to 100
  QUALITY = 90

  # Default density, any positive integer
  DENSITY = 300

  # Absolute path of the PostScript script used by page_count. It is resolved
  # relative to this file so that page_count does not depend on the working
  # directory of the calling process.
  PDF_INFO_SCRIPT = File.expand_path("../pdf_info.ps", __FILE__)

  # Default exception class for Grim.
  class Exception < ::StandardError
  end

  # Exception that is raised if pdf is not found.
  class PdfNotFound < Grim::Exception
  end

  # The 1-based number of the page that to_image and text operate on.
  attr_accessor :page_number

  # initialize is called when a new instance is created and accepts path.
  # The selected page starts out as page 1.
  #
  # path - location of the pdf on disk.
  #
  # Raises Grim::PdfNotFound if nothing exists at path.
  def initialize(path)
    # File.exist? rather than File.exists?: the latter was removed in Ruby 3.2.
    raise Grim::PdfNotFound unless File.exist?(path)
    @page_number = 1
    @path = path
  end

  # page_count runs ghostscript with the pdf_info.ps script against the pdf
  # and converts whatever it prints to an integer. The result is memoized.
  #
  # For example:
  #
  #   instance.page_count
  #   # => 4
  #
  # Returns an integer (0 when the output does not start with a number,
  # because the output is converted with to_i).
  def page_count
    @page_count ||= run("gs", "-dNODISPLAY", "-q", "-sFile=#{@path}", PDF_INFO_SCRIPT).to_i
  end

  # page just sets the page attribute on the instance.
  #
  # For example:
  #
  #   instance.page(1)
  #   # => instance
  #
  # Returns self.
  def page(number)
    @page_number = number
    self
  end

  # Returns page_number minus 1, the zero-based index passed to convert.
  def index
    @page_number - 1
  end

  # to_image extracts the selected page and turns it into an image.
  # Tested on png and jpeg.
  #
  # For example:
  #
  #   instance.page(2).to_image("/path/to/save/image.png")
  #   # => File
  #
  # Returns an instance of File opened on path; the caller is responsible
  # for closing it.
  def to_image(path)
    run(
      "convert",
      "-resize", Grim::WIDTH,
      "-antialias",
      "-render",
      "-quality", Grim::QUALITY,
      "-colorspace", "RGB",
      "-interlace", "none",
      "-density", Grim::DENSITY,
      "#{@path}[#{index}]",
      path
    )
    File.open(path)
  end

  # text is an instance method that extracts the text from the selected page.
  #
  # For example:
  #
  #   instance.page(2).text
  #   # => "This is text from slide 2.\n\nAnd even more text from slide 2."
  #
  # Returns a string
  def text
    run("pdftotext", "-enc", "UTF-8", "-f", @page_number, "-l", @page_number, @path, "-")
  end

  private

  # Runs an external command and returns everything it wrote to stdout.
  #
  # The command is passed as an argument list, so no shell is involved:
  # paths containing spaces or shell metacharacters reach the tool intact
  # and cannot be interpreted as shell syntax.
  def run(*command)
    IO.popen(command.map(&:to_s), &:read)
  end
end
data/lib/pdf_info.ps ADDED
@@ -0,0 +1,21 @@
1
+ % usage: gs -dNODISPLAY -q -sFile=____.pdf pdf_info.ps
2
+
3
+ /File where not { % /File is not defined anywhere on the dict stack: print usage and quit
4
+ (\n *** Missing input file name \(use -sFile=____.pdf\)\n) =
5
+ ( usage: gs -dNODISPLAY -q -sFile=____.pdf [ options ] toolbin/pdf_info.ps\n) =
6
+ () =
7
+ flush
8
+ quit
9
+ } if
10
+ pop % discard the dict from where
11
+
12
+ /QUIET true def % in case they forgot (presumably the -q flag shown in the usage line)
13
+
14
+ () = % print an empty line before the result
15
+ File dup (r) file runpdfbegin % open File for reading and hand the stream to runpdfbegin (defined outside this script)
16
+ /PDFPageCount pdfpagecount def % remember the value left on the stack by pdfpagecount
17
+ PDFPageCount =print () = % print the page count, then end the line
18
+ flush
19
+ () = % print a trailing empty line
20
+
21
+ quit
Binary file
@@ -0,0 +1,95 @@
1
require "fileutils"
require "spec_helper"

# Specs for Grim, run against the smoker.pdf fixture. Images are written to
# the tmp directory, which is removed once every example has run.
describe Grim do
  after(:all) { FileUtils.rm_rf(tmp_dir) }

  it "should have a VERSION constant" do
    Grim.const_defined?(:VERSION).should be_true
  end

  describe "Pdf" do
    describe "#initialize" do
      it "should raise an error if pdf does not exist" do
        missing_pdf = fixture_path("booboo.pdf")
        lambda { Grim.new(missing_pdf) }.should raise_error(Grim::PdfNotFound)
      end
    end

    describe "#page_count" do
      it "should return an integer" do
        Grim.new(fixture_path("smoker.pdf")).page_count.should == 25
      end
    end

    describe "#page" do
      it "should be set to 1 by default" do
        Grim.new(fixture_path("smoker.pdf")).page_number.should == 1
      end

      it "should set page attribute and return instance" do
        pdf = Grim.new(fixture_path("smoker.pdf"))
        returned = pdf.page(2)
        returned.should == pdf
        pdf.page_number.should == 2
      end
    end

    describe "#index" do
      it "should return page minus 1" do
        pdf = Grim.new(fixture_path("smoker.pdf"))
        pdf.page(2)
        pdf.index.should == 1
      end
    end

    describe "#to_image" do
      describe "output png" do
        # Render the default page once and share the File across examples.
        before(:all) do
          @png = Grim.new(fixture_path("smoker.pdf")).to_image(tmp_path("to_png_spec.png"))
        end

        it "should create the file" do
          png_path = tmp_path("to_png_spec.png")
          File.exist?(png_path).should be_true
        end

        it "should return an instance of File" do
          @png.class.should == File
        end

        it "should have the right file size" do
          @png.stat.size.should == 188_515
        end
      end

      describe "output jpeg" do
        # Render the default page once and share the File across examples.
        before(:all) do
          @jpeg = Grim.new(fixture_path("smoker.pdf")).to_image(tmp_path("to_jpeg_spec.jpeg"))
        end

        it "should create the file" do
          jpeg_path = tmp_path("to_jpeg_spec.jpeg")
          File.exist?(jpeg_path).should be_true
        end

        it "should return an instance of File" do
          @jpeg.class.should == File
        end

        it "should have the right file size" do
          @jpeg.stat.size.should == 53_980
        end
      end
    end

    describe "#text" do
      it "should return the text from the selected page" do
        expected_text = "Step 1: get someone to print this curve for you to scale, 72\342\200\235 wide\n\nStep 2: Get a couple 55 gallon drums\n\n\f"
        pdf = Grim.new(fixture_path("smoker.pdf"))
        pdf.page(2).text.should == expected_text
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
require 'rubygems'
require 'bundler/setup'

require 'grim'

# Shared helpers for the specs. Paths are anchored to the location of this
# file instead of the process working directory, so the specs find their
# fixtures no matter where rspec is started from. When started from the
# project root the resulting paths are the same as before.
RSpec.configure do |config|
  # Returns the absolute path of the named file in the fixtures directory
  # that sits next to this helper.
  def fixture_path(name)
    path = File.expand_path("../fixtures", __FILE__)
    File.join(path, name)
  end

  # Returns the absolute path of the tmp directory one level above the
  # directory holding this helper, creating the directory if it is missing.
  def tmp_dir
    path = File.expand_path("../../tmp", __FILE__)
    Dir.mkdir(path) unless File.directory?(path)
    path
  end

  # Returns the absolute path of the named file inside tmp_dir.
  def tmp_path(name)
    File.join(tmp_dir, name)
  end
end
metadata ADDED
@@ -0,0 +1,79 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: grim
3
+ version: !ruby/object:Gem::Version
4
+ hash: 27
5
+ prerelease:
6
+ segments:
7
+ - 0
8
+ - 1
9
+ - 0
10
+ version: 0.1.0
11
+ platform: ruby
12
+ authors:
13
+ - Jonathan Hoyt
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2011-09-05 00:00:00 -04:00
19
+ default_executable:
20
+ dependencies: []
21
+
22
+ description: Grim is a simple gem for extracting a page from a pdf and converting it to an image as well as extract the text from the page as a string. It basically gives you an easy to use api to ghostscript, imagemagick, and pdftotext specific to this use case.
23
+ email:
24
+ - jonmagic@gmail.com
25
+ executables: []
26
+
27
+ extensions: []
28
+
29
+ extra_rdoc_files: []
30
+
31
+ files:
32
+ - .gitignore
33
+ - Gemfile
34
+ - README.textile
35
+ - Rakefile
36
+ - grim.gemspec
37
+ - lib/grim.rb
38
+ - lib/pdf_info.ps
39
+ - spec/fixtures/smoker.pdf
40
+ - spec/lib/grim_spec.rb
41
+ - spec/spec_helper.rb
42
+ has_rdoc: true
43
+ homepage: ""
44
+ licenses: []
45
+
46
+ post_install_message:
47
+ rdoc_options: []
48
+
49
+ require_paths:
50
+ - lib
51
+ required_ruby_version: !ruby/object:Gem::Requirement
52
+ none: false
53
+ requirements:
54
+ - - ">="
55
+ - !ruby/object:Gem::Version
56
+ hash: 3
57
+ segments:
58
+ - 0
59
+ version: "0"
60
+ required_rubygems_version: !ruby/object:Gem::Requirement
61
+ none: false
62
+ requirements:
63
+ - - ">="
64
+ - !ruby/object:Gem::Version
65
+ hash: 3
66
+ segments:
67
+ - 0
68
+ version: "0"
69
+ requirements: []
70
+
71
+ rubyforge_project: grim
72
+ rubygems_version: 1.6.2
73
+ signing_key:
74
+ specification_version: 3
75
+ summary: Extract slides and text from a PDF.
76
+ test_files:
77
+ - spec/fixtures/smoker.pdf
78
+ - spec/lib/grim_spec.rb
79
+ - spec/spec_helper.rb