grim 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source :rubygems
2
+ gemspec
3
+ gem 'rspec'
4
+ gem 'ruby-debug'
data/README.textile ADDED
@@ -0,0 +1,17 @@
1
+ h1. Grim
2
+
3
+ Grim is a simple gem for extracting a page from a PDF and converting it to an image, as well as extracting the text from the page as a string. It gives you an easy-to-use API to Ghostscript, ImageMagick, and pdftotext specific to this use case.
4
+
5
+ h2. Prerequisites
6
+
7
+ You will need Ghostscript, ImageMagick, and xpdf installed. On the Mac (OS X) I highly recommend using "Homebrew":http://mxcl.github.com/homebrew/ to get them installed; it's as simple as "brew install ghostscript", "brew install imagemagick", and "brew install xpdf".
8
+
9
+ h2. Usage
10
+
11
+ <pre><code>
12
+ instance = Grim.new("/path/to/pdf")
13
+ page_count = instance.page_count # returns the number of pages in the pdf
14
+ png = instance.page(1).to_image("/path/to/save/image.png") # saves png to path and returns File instance
15
+ jpeg = instance.page(2).to_image("/path/to/save/image.jpeg") # saves jpeg to path and returns File instance
16
+ text = instance.page(3).text # returns text as a string
17
+ </code></pre>
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
data/grim.gemspec ADDED
@@ -0,0 +1,20 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "grim"
4
+
5
+ Gem::Specification.new do |s|
6
+ s.name = "grim"
7
+ s.version = Grim::VERSION
8
+ s.authors = ["Jonathan Hoyt"]
9
+ s.email = ["jonmagic@gmail.com"]
10
+ s.homepage = ""
11
+ s.summary = %q{Extract slides and text from a PDF.}
12
+ s.description = %q{Grim is a simple gem for extracting a page from a pdf and converting it to an image as well as extract the text from the page as a string. It basically gives you an easy to use api to ghostscript, imagemagick, and pdftotext specific to this use case.}
13
+
14
+ s.rubyforge_project = "grim"
15
+
16
+ s.files = `git ls-files`.split("\n")
17
+ s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
18
+ s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
19
+ s.require_paths = ["lib"]
20
+ end
data/lib/grim.rb ADDED
@@ -0,0 +1,104 @@
1
+ # Grim is a class with instance methods for getting number of pages in a pdf,
2
+ # extracting a page as an image, and extracting the text from a page.
3
+ #
4
+ # For example:
5
+ #
6
+ # instance = Grim.new("/path/to/pdf")
7
+ # page_count = instance.page_count
8
+ #   png        = instance.page(1).to_image("/path/to/save/image.png")
9
+ #   jpeg       = instance.page(2).to_image("/path/to/save/image.jpeg")
10
+ # text = instance.page(3).text
11
+ #
12
+ class Grim
13
+ # VERSION
14
+ VERSION = "0.1.0"
15
+
16
+ # Default resize output width, any positive integer
17
+ WIDTH = 1024
18
+
19
+ # Default image quality, 1 to 100
20
+ QUALITY = 90
21
+
22
+ # Default density, any positive integer
23
+ DENSITY = 300
24
+
25
+ # Default exception class for Grim.
26
+ class Exception < ::StandardError
27
+ end
28
+
29
+ # Exception that is raised if pdf is not found.
30
+ class PdfNotFound < Grim::Exception
31
+ end
32
+
33
+ # be able to store what page instance should focus on
34
+ attr_accessor :page_number
35
+
36
+ # initialize is called when a new instance is created and accepts path.
37
+ def initialize(path)
38
+ raise Grim::PdfNotFound unless File.exists?(path)
39
+ @page_number = 1
40
+ @path = path
41
+ end
42
+
43
+ # page_count uses the memoized path and shells out to ghostscript
44
+ # to read the pdf with the pdf_info.ps script as a filter,
45
+ # returning the number of pages in the pdf as an integer.
46
+ #
47
+ # For example:
48
+ #
49
+ # instance.page_count
50
+ # => 4
51
+ #
52
+ # Returns an integer.
53
+ def page_count
54
+ @page_count ||= begin
55
+ `gs -dNODISPLAY -q -sFile=#{@path} ./lib/pdf_info.ps`.to_i
56
+ end
57
+ end
58
+
59
+ # page just sets the page attribute on the instance.
60
+ #
61
+ # For example:
62
+ #
63
+ # instance.page(1)
64
+ # => instance
65
+ #
66
+ # Returns self.
67
+ def page(number)
68
+ @page_number = number
69
+ self
70
+ end
71
+
72
+ # Returns page_number minus 1
73
+ def index
74
+ @page_number - 1
75
+ end
76
+
77
+ # to_image extracts the selected page and turns it into an image.
78
+ # Tested on png and jpeg.
79
+ #
80
+ # For example:
81
+ #
82
+ # instance.page(2).to_image(/path/to/save/image)
83
+ # => File
84
+ #
85
+ # Returns an instance of File
86
+ def to_image(path)
87
+ `convert -resize #{Grim::WIDTH} -antialias -render -quality #{Grim::QUALITY} -colorspace RGB -interlace none -density #{Grim::DENSITY} #{@path}[#{index}] #{path}`
88
+ file = File.open(path)
89
+ file.rewind
90
+ file
91
+ end
92
+
93
+ # text is an instance method that extracts the text from the selected page.
94
+ #
95
+ # For example:
96
+ #
97
+ # instance.page(2).text
98
+ # => "This is text from slide 2.\n\nAnd even more text from slide 2."
99
+ #
100
+ # Returns a string
101
+ def text
102
+ `pdftotext -enc UTF-8 -f #{@page_number} -l #{@page_number} #{@path} -`
103
+ end
104
+ end
data/lib/pdf_info.ps ADDED
@@ -0,0 +1,21 @@
1
+ % usage: gs -dNODISPLAY -q -sFile=____.pdf pdf_info.ps
2
+
3
+ /File where not {
4
+ (\n *** Missing input file name \(use -sFile=____.pdf\)\n) =
5
+ ( usage: gs -dNODISPLAY -q -sFile=____.pdf [ options ] toolbin/pdf_info.ps\n) =
6
+ () =
7
+ flush
8
+ quit
9
+ } if
10
+ pop % discard the dict from where
11
+
12
+ /QUIET true def % in case they forgot
13
+
14
+ () =
15
+ File dup (r) file runpdfbegin
16
+ /PDFPageCount pdfpagecount def
17
+ PDFPageCount =print () =
18
+ flush
19
+ () =
20
+
21
+ quit
Binary file
@@ -0,0 +1,95 @@
1
+ require 'fileutils'
2
+ require 'spec_helper'
3
+
4
+ describe Grim do
5
+ after(:all) do
6
+ FileUtils.rm_rf(tmp_dir)
7
+ end
8
+
9
+ it "should have a VERSION constant" do
10
+ Grim.const_defined?('VERSION').should be_true
11
+ end
12
+
13
+ describe "Pdf" do
14
+ describe "#initialize" do
15
+ it "should raise an error if pdf does not exist" do
16
+ lambda { Grim.new(fixture_path("booboo.pdf")) }.should raise_error(Grim::PdfNotFound)
17
+ end
18
+ end
19
+
20
+ describe "#page_count" do
21
+ it "should return an integer" do
22
+ instance = Grim.new(fixture_path("smoker.pdf"))
23
+ instance.page_count.should == 25
24
+ end
25
+ end
26
+
27
+ describe "#page" do
28
+ it "should be set to 1 by default" do
29
+ instance = Grim.new(fixture_path("smoker.pdf"))
30
+ instance.page_number.should == 1
31
+ end
32
+
33
+ it "should set page attribute and return instance" do
34
+ instance = Grim.new(fixture_path("smoker.pdf"))
35
+ instance.page(2).should == instance
36
+ instance.page_number.should == 2
37
+ end
38
+ end
39
+
40
+ describe "#index" do
41
+ it "should return page minus 1" do
42
+ instance = Grim.new(fixture_path("smoker.pdf"))
43
+ instance.page(2)
44
+ instance.index.should == 1
45
+ end
46
+ end
47
+
48
+ describe "#to_image" do
49
+ describe "output png" do
50
+ before(:all) do
51
+ instance = Grim.new(fixture_path("smoker.pdf"))
52
+ @png = instance.to_image(tmp_path("to_png_spec.png"))
53
+ end
54
+
55
+ it "should create the file" do
56
+ File.exist?(tmp_path("to_png_spec.png")).should be_true
57
+ end
58
+
59
+ it "should return an instance of File" do
60
+ @png.class.should == File
61
+ end
62
+
63
+ it "should have the right file size" do
64
+ @png.stat.size.should == 188515
65
+ end
66
+ end
67
+
68
+ describe "output jpeg" do
69
+ before(:all) do
70
+ instance = Grim.new(fixture_path("smoker.pdf"))
71
+ @jpeg = instance.to_image(tmp_path("to_jpeg_spec.jpeg"))
72
+ end
73
+
74
+ it "should create the file" do
75
+ File.exist?(tmp_path("to_jpeg_spec.jpeg")).should be_true
76
+ end
77
+
78
+ it "should return an instance of File" do
79
+ @jpeg.class.should == File
80
+ end
81
+
82
+ it "should have the right file size" do
83
+ @jpeg.stat.size.should == 53980
84
+ end
85
+ end
86
+ end
87
+
88
+ describe "#text" do
89
+ it "should return the text from the selected page" do
90
+ instance = Grim.new(fixture_path("smoker.pdf"))
91
+ instance.page(2).text.should == "Step 1: get someone to print this curve for you to scale, 72\342\200\235 wide\n\nStep 2: Get a couple 55 gallon drums\n\n\f"
92
+ end
93
+ end
94
+ end
95
+ end
@@ -0,0 +1,21 @@
1
+ require 'rubygems'
2
+ require 'bundler/setup'
3
+
4
+ require 'grim'
5
+
6
+ RSpec.configure do |config|
7
+ def fixture_path(name)
8
+ path = File.expand_path("./spec/fixtures/")
9
+ File.join(path, name)
10
+ end
11
+
12
+ def tmp_dir
13
+ path = File.expand_path("./tmp")
14
+ Dir.mkdir(path) unless File.directory?(path)
15
+ path
16
+ end
17
+
18
+ def tmp_path(name)
19
+ File.join(tmp_dir, name)
20
+ end
21
+ end
metadata ADDED
@@ -0,0 +1,79 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: grim
3
+ version: !ruby/object:Gem::Version
4
+ hash: 27
5
+ prerelease:
6
+ segments:
7
+ - 0
8
+ - 1
9
+ - 0
10
+ version: 0.1.0
11
+ platform: ruby
12
+ authors:
13
+ - Jonathan Hoyt
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2011-09-05 00:00:00 -04:00
19
+ default_executable:
20
+ dependencies: []
21
+
22
+ description: Grim is a simple gem for extracting a page from a pdf and converting it to an image as well as extract the text from the page as a string. It basically gives you an easy to use api to ghostscript, imagemagick, and pdftotext specific to this use case.
23
+ email:
24
+ - jonmagic@gmail.com
25
+ executables: []
26
+
27
+ extensions: []
28
+
29
+ extra_rdoc_files: []
30
+
31
+ files:
32
+ - .gitignore
33
+ - Gemfile
34
+ - README.textile
35
+ - Rakefile
36
+ - grim.gemspec
37
+ - lib/grim.rb
38
+ - lib/pdf_info.ps
39
+ - spec/fixtures/smoker.pdf
40
+ - spec/lib/grim_spec.rb
41
+ - spec/spec_helper.rb
42
+ has_rdoc: true
43
+ homepage: ""
44
+ licenses: []
45
+
46
+ post_install_message:
47
+ rdoc_options: []
48
+
49
+ require_paths:
50
+ - lib
51
+ required_ruby_version: !ruby/object:Gem::Requirement
52
+ none: false
53
+ requirements:
54
+ - - ">="
55
+ - !ruby/object:Gem::Version
56
+ hash: 3
57
+ segments:
58
+ - 0
59
+ version: "0"
60
+ required_rubygems_version: !ruby/object:Gem::Requirement
61
+ none: false
62
+ requirements:
63
+ - - ">="
64
+ - !ruby/object:Gem::Version
65
+ hash: 3
66
+ segments:
67
+ - 0
68
+ version: "0"
69
+ requirements: []
70
+
71
+ rubyforge_project: grim
72
+ rubygems_version: 1.6.2
73
+ signing_key:
74
+ specification_version: 3
75
+ summary: Extract slides and text from a PDF.
76
+ test_files:
77
+ - spec/fixtures/smoker.pdf
78
+ - spec/lib/grim_spec.rb
79
+ - spec/spec_helper.rb