grim 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2011 Jonathan Hoyt
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.textile CHANGED
@@ -1,17 +1,34 @@
1
1
  h1. Grim
2
2
 
3
- Grim is a simple gem for extracting a page from a pdf and converting it to an image as well as extract the text from the page as a string. It basically gives you an easy to use api to ghostscript, imagemagick, and pdftotext specific to this use case.
3
+ Grim is a simple gem for extracting (reaping) a page from a pdf and converting it to an image, as well as extracting the text from the page as a string. It basically gives you an easy-to-use API to ghostscript, imagemagick, and pdftotext specific to this use case.
4
4
 
5
5
  h2. Prerequisites
6
6
 
7
- You will need ghostscript, imagemagick, and xpdf installed. On the Mac (OSX) I highly recommend using "Homebrew":http://mxcl.github.com/homebrew/ to get them installed, its as simple as "brew install ghostscript", "brew install imagemagick", and "brew install xpdf".
7
+ You will need ghostscript, imagemagick, and xpdf installed. On the Mac (OSX) I highly recommend using "Homebrew":http://mxcl.github.com/homebrew/ to get them installed.
8
+
9
+ <pre><code>
10
+ brew install ghostscript imagemagick xpdf
11
+ </code></pre>
12
+
13
+ h2. Installation
14
+
15
+ <pre><code>
16
+ gem install grim
17
+ </code></pre>
8
18
 
9
19
  h2. Usage
10
20
 
11
21
  <pre><code>
12
- instance = Grim.new("/path/to/pdf")
13
- page_count = instance.page_count # returns the number of pages in the pdf
14
- png = instance.page(1).to_image("/path/to/save/image.png") # saves png to path and returns File instance
15
- jpeg = instance.page(2).to_image("/path/to/save/image.jpeg") # saves jpeg to path and returns File instance
16
- text = instance.page(3).text # returns text as a string
17
- </pre></code>
22
+ pdf = Grim.reap("/path/to/pdf") # returns Grim::Pdf instance for pdf
23
+ count = pdf.count # returns the number of pages in the pdf
24
+ png = pdf[3].save('/path/to/image.png') # will return true if page was saved or false if not
25
+ text = pdf[3].text # returns text as a String
26
+
27
+ pdf.each do |page|
28
+ puts page.text
29
+ end
30
+ </code></pre>
31
+
32
+ h2. License
33
+
34
+ See LICENSE for details.
data/grim.gemspec CHANGED
@@ -7,11 +7,12 @@ Gem::Specification.new do |s|
7
7
  s.version = Grim::VERSION
8
8
  s.authors = ["Jonathan Hoyt"]
9
9
  s.email = ["jonmagic@gmail.com"]
10
- s.homepage = ""
10
+ s.homepage = "http://github.com/jonmagic/grim"
11
11
  s.summary = %q{Extract slides and text from a PDF.}
12
12
  s.description = %q{Grim is a simple gem for extracting a page from a pdf and converting it to an image as well as extract the text from the page as a string. It basically gives you an easy to use api to ghostscript, imagemagick, and pdftotext specific to this use case.}
13
13
 
14
14
  s.rubyforge_project = "grim"
15
+ s.add_dependency 'safe_shell', '~> 1.0.0'
15
16
 
16
17
  s.files = `git ls-files`.split("\n")
17
18
  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
data/lib/grim.rb CHANGED
@@ -1,17 +1,8 @@
1
- # Grim is a class with instance methods for getting number of pages in a pdf,
2
- # extracting a page as an image, and extracting the text from a page.
3
- #
4
- # For example:
5
- #
6
- # instance = Grim.new("/path/to/pdf")
7
- # page_count = instance.page_count
8
- # png = instance.page(1).to_png("/path/to/save/png")
9
- # jpeg = instance.page(2).to_jpeg("/path/to/save/jpeg")
10
- # text = instance.page(3).text
11
- #
12
- class Grim
1
+ require 'safe_shell'
2
+
3
+ module Grim
13
4
  # VERSION
14
- VERSION = "0.1.0"
5
+ VERSION = "0.2.0"
15
6
 
16
7
  # Default resize output width, any positive integer
17
8
  WIDTH = 1024
@@ -30,75 +21,24 @@ class Grim
30
21
  class PdfNotFound < Grim::Exception
31
22
  end
32
23
 
33
- # be able to store what page instance should focus on
34
- attr_accessor :page_number
35
-
36
- # initialize is called when a new instance is created and accepts path.
37
- def initialize(path)
38
- raise Grim::PdfNotFound unless File.exists?(path)
39
- @page_number = 1
40
- @path = path
24
+ # Exception that is raised if the pdf does not have the requested page
25
+ class PageNotFound < Grim::Exception
41
26
  end
42
27
 
43
- # page_count uses the memoized path and shells out to ghostscript
44
- # to read the pdf with the pdf_info.ps script as a filter,
45
- # returning the number of pages in the pdf as an integer.
46
- #
47
- # For example:
48
- #
49
- # instance.page_count
50
- # => 4
28
+ # Creates and returns a new instance of Grim::Pdf
51
29
  #
52
- # Returns an integer.
53
- def page_count
54
- @page_count ||= begin
55
- `gs -dNODISPLAY -q -sFile=#{@path} ./lib/pdf_info.ps`.to_i
56
- end
57
- end
58
-
59
- # page just sets the page attribute on the instance.
30
+ # path - a path string or object
60
31
  #
61
32
  # For example:
62
33
  #
63
- # instance.page(1)
64
- # => instance
34
+ # pdf = Grim.reap("/path/to/pdf")
65
35
  #
66
- # Returns self.
67
- def page(number)
68
- @page_number = number
69
- self
70
- end
71
-
72
- # Returns page_number minus 1
73
- def index
74
- @page_number - 1
75
- end
76
-
77
- # to_image extracts the selected page and turns it into an image.
78
- # Tested on png and jpeg.
79
- #
80
- # For example:
36
+ # Returns an instance of Grim::Pdf
81
37
  #
82
- # instance.page(2).to_image(/path/to/save/image)
83
- # => File
84
- #
85
- # Returns an instance of File
86
- def to_image(path)
87
- `convert -resize #{Grim::WIDTH} -antialias -render -quality #{Grim::QUALITY} -colorspace RGB -interlace none -density #{Grim::DENSITY} #{@path}[#{index}] #{path}`
88
- file = File.open(path)
89
- file.rewind
90
- file
38
+ def self.reap(path)
39
+ Grim::Pdf.new(path)
91
40
  end
41
+ end
92
42
 
93
- # text is an instance method that extracts the text from the selected page.
94
- #
95
- # For example:
96
- #
97
- # instance.page(2).text
98
- # => "This is text from slide 2.\n\nAnd even more text from slide 2."
99
- #
100
- # Returns a string
101
- def text
102
- `pdftotext -enc UTF-8 -f #{@page_number} -l #{@page_number} #{@path} -`
103
- end
104
- end
43
+ require 'grim/pdf'
44
+ require 'grim/page'
data/lib/grim/page.rb ADDED
@@ -0,0 +1,50 @@
1
+ module Grim
2
+ class Page
3
+
4
+ attr_reader :number
5
+
6
+ # Sets up some instance variables on new instance.
7
+ #
8
+ # pdf - the pdf this page belongs to
9
+ # index - the index of the page in the array of pages
10
+ #
11
+ def initialize(pdf, index)
12
+ @pdf = pdf
13
+ @index = index
14
+ @number = index + 1
15
+ end
16
+
17
+ # Extracts the selected page and turns it into an image.
18
+ # Tested on png and jpeg.
19
+ #
20
+ # path - String of the path to save to
21
+ #
22
+ # For example:
23
+ #
24
+ # pdf[1].save("/path/to/save/image.png")
25
+ # # => true
26
+ #
27
+ # Returns true if a file exists at path after the conversion, false otherwise.
28
+ #
29
+ def save(path)
30
+ SafeShell.execute("convert", "-resize", Grim::WIDTH, "-antialias", "-render",
31
+ "-quality", Grim::QUALITY, "-colorspace", "RGB",
32
+ "-interlace", "none", "-density", Grim::DENSITY,
33
+ "#{@pdf.path}[#{@index}]", path)
34
+ File.exists?(path)
35
+ end
36
+
37
+ # Extracts the text from the selected page.
38
+ #
39
+ # For example:
40
+ #
41
+ # pdf[1].text
42
+ # # => "This is text from slide 2.\n\nAnd even more text from slide 2."
43
+ #
44
+ # Returns a String.
45
+ #
46
+ def text
47
+ SafeShell.execute("pdftotext", "-enc", "UTF-8", "-f", @number, "-l", @number, @pdf.path, "-")
48
+ end
49
+ end
50
+ end
data/lib/grim/pdf.rb ADDED
@@ -0,0 +1,60 @@
1
+ module Grim
2
+ class Pdf
3
+ include Enumerable
4
+
5
+ attr_reader :path
6
+
7
+ # ghostscript prints out a warning, this regex matches it
8
+ WarningRegex = /\*\*\*\*.*\n/
9
+
10
+ # Raises an error if pdf not found and sets some instance
11
+ # variables if pdf is found.
12
+ #
13
+ # path - A String or Path to the pdf
14
+ #
15
+ def initialize(path)
16
+ raise Grim::PdfNotFound unless File.exists?(path)
17
+ @path = path
18
+ end
19
+
20
+ # Shells out to ghostscript to read the pdf with the pdf_info.ps script
21
+ # as a filter, returning the number of pages in the pdf as an integer.
22
+ #
23
+ # For example:
24
+ #
25
+ # pdf.count
26
+ # # => 4
27
+ #
28
+ # Returns an Integer.
29
+ #
30
+ def count
31
+ @count ||= begin
32
+ result = SafeShell.execute("gs", "-dNODISPLAY", "-q", "-sFile=#{@path}", "./lib/pdf_info.ps")
33
+
34
+ result.gsub(WarningRegex, '').to_i
35
+ end
36
+ end
37
+
38
+ # Creates an instance of Grim::Page for the index passed in.
39
+ #
40
+ # index - accepts Integer for position in array
41
+ #
42
+ # For example:
43
+ #
44
+ # pdf[4] # returns 5th page
45
+ #
46
+ # Returns an instance of Grim::Page.
47
+ #
48
+ def [](index)
49
+ raise Grim::PageNotFound unless index >= 0 && index < count
50
+ Grim::Page.new(self, index)
51
+ end
52
+
53
+ def each
54
+ (0..(count-1)).each do |index|
55
+ yield Grim::Page.new(self, index)
56
+ end
57
+ end
58
+
59
+ end
60
+ end
@@ -0,0 +1,36 @@
1
+ require 'fileutils'
2
+ require 'spec_helper'
3
+
4
+ describe Grim::Page do
5
+ after(:all) do
6
+ FileUtils.rm_rf(tmp_dir)
7
+ end
8
+
9
+ it "should have number" do
10
+ Grim::Page.new(Grim::Pdf.new(fixture_path("smoker.pdf")), 1).number.should == 2
11
+ end
12
+
13
+ describe "#save" do
14
+ before(:all) do
15
+ pdf = Grim::Pdf.new(fixture_path("smoker.pdf"))
16
+ pdf[0].save(tmp_path("to_png_spec.png"))
17
+ @file = File.open(tmp_path("to_png_spec.png"))
18
+ end
19
+
20
+ it "should create the file" do
21
+ File.exist?(tmp_path("to_png_spec.png")).should be_true
22
+ end
23
+
24
+ it "should have the right file size" do
25
+ @file.stat.size.should == 188515
26
+ end
27
+ end
28
+
29
+
30
+ describe "#text" do
31
+ it "should return the text from the selected page" do
32
+ pdf = Grim::Pdf.new(fixture_path("smoker.pdf"))
33
+ pdf[1].text.should == "Step 1: get someone to print this curve for you to scale, 72\342\200\235 wide\n\nStep 2: Get a couple 55 gallon drums\n\n\f"
34
+ end
35
+ end
36
+ end
@@ -0,0 +1,48 @@
1
+ require 'spec_helper'
2
+
3
+ describe Grim::Pdf do
4
+
5
+ it "should have a path" do
6
+ Grim::Pdf.new(fixture_path("smoker.pdf")).path.should == fixture_path("smoker.pdf")
7
+ end
8
+
9
+ describe "#initialize" do
10
+ it "should raise an error if pdf does not exist" do
11
+ lambda { Grim::Pdf.new(fixture_path("booboo.pdf")) }.should raise_error(Grim::PdfNotFound)
12
+ end
13
+
14
+ it "should set path on pdf" do
15
+ pdf = Grim::Pdf.new(fixture_path("smoker.pdf"))
16
+ pdf.path.should == fixture_path("smoker.pdf")
17
+ end
18
+ end
19
+
20
+ describe "#count" do
21
+ it "should return 25" do
22
+ pdf = Grim::Pdf.new(fixture_path("smoker.pdf"))
23
+ pdf.count.should == 25
24
+ end
25
+ end
26
+
27
+ describe "#[]" do
28
+ before(:each) do
29
+ @pdf = Grim::Pdf.new(fixture_path("smoker.pdf"))
30
+ end
31
+
32
+ it "should raise Grim::PageDoesNotExist if page doesn't exist" do
33
+ lambda { @pdf[25] }.should raise_error(Grim::PageNotFound)
34
+ end
35
+
36
+ it "should return an instance of Grim::Page if page exists" do
37
+ @pdf[24].class.should == Grim::Page
38
+ end
39
+ end
40
+
41
+ describe "#each" do
42
+ it "should be iterable" do
43
+ pdf = Grim::Pdf.new(fixture_path("smoker.pdf"))
44
+ pdf.map {|p| p.number }.should == (1..25).to_a
45
+ end
46
+ end
47
+
48
+ end
@@ -1,95 +1,25 @@
1
- require 'fileutils'
2
1
  require 'spec_helper'
3
2
 
4
3
  describe Grim do
5
- after(:all) do
6
- FileUtils.rm_rf(tmp_dir)
7
- end
8
-
9
4
  it "should have a VERSION constant" do
10
5
  Grim.const_defined?('VERSION').should be_true
11
6
  end
12
7
 
13
- describe "Pdf" do
14
- describe "#initialize" do
15
- it "should raise an error if pdf does not exist" do
16
- lambda { Grim.new(fixture_path("booboo.pdf")) }.should raise_error(Grim::PdfNotFound)
17
- end
18
- end
19
-
20
- describe "#page_count" do
21
- it "should return an integer" do
22
- instance = Grim.new(fixture_path("smoker.pdf"))
23
- instance.page_count.should == 25
24
- end
25
- end
26
-
27
- describe "#page" do
28
- it "should be set to 1 by default" do
29
- instance = Grim.new(fixture_path("smoker.pdf"))
30
- instance.page_number.should == 1
31
- end
32
-
33
- it "should set page attribute and return instance" do
34
- instance = Grim.new(fixture_path("smoker.pdf"))
35
- instance.page(2).should == instance
36
- instance.page_number.should == 2
37
- end
38
- end
39
-
40
- describe "#index" do
41
- it "should return page minus 1" do
42
- instance = Grim.new(fixture_path("smoker.pdf"))
43
- instance.page(2)
44
- instance.index.should == 1
45
- end
46
- end
47
-
48
- describe "#to_image" do
49
- describe "output png" do
50
- before(:all) do
51
- instance = Grim.new(fixture_path("smoker.pdf"))
52
- @png = instance.to_image(tmp_path("to_png_spec.png"))
53
- end
54
-
55
- it "should create the file" do
56
- File.exist?(tmp_path("to_png_spec.png")).should be_true
57
- end
58
-
59
- it "should return an instance of File" do
60
- @png.class.should == File
61
- end
62
-
63
- it "should have the right file size" do
64
- @png.stat.size.should == 188515
65
- end
66
- end
67
-
68
- describe "output jpeg" do
69
- before(:all) do
70
- instance = Grim.new(fixture_path("smoker.pdf"))
71
- @jpeg = instance.to_image(tmp_path("to_jpeg_spec.jpeg"))
72
- end
73
-
74
- it "should create the file" do
75
- File.exist?(tmp_path("to_jpeg_spec.jpeg")).should be_true
76
- end
8
+ it "should have WIDTH constant set to 1024" do
9
+ Grim::WIDTH.should == 1024
10
+ end
77
11
 
78
- it "should return an instance of File" do
79
- @jpeg.class.should == File
80
- end
12
+ it "should have QUALITY constant set to 90" do
13
+ Grim::QUALITY.should == 90
14
+ end
81
15
 
82
- it "should have the right file size" do
83
- @jpeg.stat.size.should == 53980
84
- end
85
- end
86
- end
16
+ it "should have DENSITY constant set to 300" do
17
+ Grim::DENSITY.should == 300
18
+ end
87
19
 
88
- describe "#text" do
89
- it "should return the text from the selected page" do
90
- instance = Grim.new(fixture_path("smoker.pdf"))
91
- instance.page(2).text.should == "Step 1: get someone to print this curve for you to scale, 72\342\200\235 wide\n\nStep 2: Get a couple 55 gallon drums\n\n\f"
92
- end
20
+ describe "#new" do
21
+ it "should return an instance of Grim::Pdf" do
22
+ Grim.reap(fixture_path("smoker.pdf")).class.should == Grim::Pdf
93
23
  end
94
24
  end
95
25
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: grim
3
3
  version: !ruby/object:Gem::Version
4
- hash: 27
4
+ hash: 23
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
- - 1
8
+ - 2
9
9
  - 0
10
- version: 0.1.0
10
+ version: 0.2.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - Jonathan Hoyt
@@ -15,10 +15,25 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2011-09-05 00:00:00 -04:00
18
+ date: 2011-09-06 00:00:00 -04:00
19
19
  default_executable:
20
- dependencies: []
21
-
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: safe_shell
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ~>
28
+ - !ruby/object:Gem::Version
29
+ hash: 23
30
+ segments:
31
+ - 1
32
+ - 0
33
+ - 0
34
+ version: 1.0.0
35
+ type: :runtime
36
+ version_requirements: *id001
22
37
  description: Grim is a simple gem for extracting a page from a pdf and converting it to an image as well as extract the text from the page as a string. It basically gives you an easy to use api to ghostscript, imagemagick, and pdftotext specific to this use case.
23
38
  email:
24
39
  - jonmagic@gmail.com
@@ -31,16 +46,21 @@ extra_rdoc_files: []
31
46
  files:
32
47
  - .gitignore
33
48
  - Gemfile
49
+ - LICENSE
34
50
  - README.textile
35
51
  - Rakefile
36
52
  - grim.gemspec
37
53
  - lib/grim.rb
54
+ - lib/grim/page.rb
55
+ - lib/grim/pdf.rb
38
56
  - lib/pdf_info.ps
39
57
  - spec/fixtures/smoker.pdf
58
+ - spec/lib/grim/page_spec.rb
59
+ - spec/lib/grim/pdf_spec.rb
40
60
  - spec/lib/grim_spec.rb
41
61
  - spec/spec_helper.rb
42
62
  has_rdoc: true
43
- homepage: ""
63
+ homepage: http://github.com/jonmagic/grim
44
64
  licenses: []
45
65
 
46
66
  post_install_message:
@@ -75,5 +95,7 @@ specification_version: 3
75
95
  summary: Extract slides and text from a PDF.
76
96
  test_files:
77
97
  - spec/fixtures/smoker.pdf
98
+ - spec/lib/grim/page_spec.rb
99
+ - spec/lib/grim/pdf_spec.rb
78
100
  - spec/lib/grim_spec.rb
79
101
  - spec/spec_helper.rb