pdf-reader 0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +2 -0
- data/README +177 -0
- data/Rakefile +84 -0
- data/TODO +9 -0
- data/lib/pdf/reader.rb +106 -0
- data/lib/pdf/reader/buffer.rb +144 -0
- data/lib/pdf/reader/content.rb +289 -0
- data/lib/pdf/reader/error.rb +53 -0
- data/lib/pdf/reader/explore.rb +116 -0
- data/lib/pdf/reader/filter.rb +62 -0
- data/lib/pdf/reader/name.rb +37 -0
- data/lib/pdf/reader/parser.rb +203 -0
- data/lib/pdf/reader/reference.rb +55 -0
- data/lib/pdf/reader/register_receiver.rb +18 -0
- data/lib/pdf/reader/text_receiver.rb +259 -0
- data/lib/pdf/reader/token.rb +41 -0
- data/lib/pdf/reader/xref.rb +101 -0
- metadata +70 -0
data/CHANGELOG
ADDED
data/README
ADDED
@@ -0,0 +1,177 @@
|
|
1
|
+
The PDF::Reader library implements a PDF parser conforming as much as possible
|
2
|
+
to the PDF specification from Adobe.
|
3
|
+
|
4
|
+
It provides programmatic access to the contents of a PDF file with a high
|
5
|
+
degree of flexibility.
|
6
|
+
|
7
|
+
The PDF 1.7 specification is a weighty document and not all aspects are
|
8
|
+
currently supported. We welcome submission of PDF files that exhibit
|
9
|
+
unsupported aspects of the spec to assist with improving our support.
|
10
|
+
|
11
|
+
= Installation
|
12
|
+
|
13
|
+
The recommended installation method is via Rubygems.
|
14
|
+
|
15
|
+
gem install pdf-reader
|
16
|
+
|
17
|
+
= Usage
|
18
|
+
|
19
|
+
PDF::Reader is designed with a callback-style architecture. The basic concept
|
20
|
+
is to build a receiver class and pass that into PDF::Reader along with the PDF
|
21
|
+
to process.
|
22
|
+
|
23
|
+
As PDF::Reader walks the file and encounters various objects (pages, text,
|
24
|
+
images, shapes, etc) it will call methods on the receiver class. What those
|
25
|
+
methods do is entirely up to you - save the text, extract images, count pages,
|
26
|
+
read metadata, whatever.
|
27
|
+
|
28
|
+
For a full list of the supported callback methods and a description of when they
|
29
|
+
will be called, refer to PDF::Reader::Content.
|
30
|
+
|
31
|
+
= Exceptions
|
32
|
+
|
33
|
+
There are two key exceptions that you will need to watch out for when processing a
|
34
|
+
PDF file:
|
35
|
+
|
36
|
+
MalformedPDFError - The PDF appears to be corrupt in some way. If you believe the
|
37
|
+
file should be valid, or that a corrupt file didn't raise an exception, please
|
38
|
+
forward a copy of the file to the maintainers and we can attempt to improve the code.
|
39
|
+
|
40
|
+
UnsupportedFeatureError - The PDF uses a feature that PDF::Reader doesn't currently
|
41
|
+
support. Again, we welcome submissions of PDF files that exhibit these features to help
|
42
|
+
us with future code improvements.
|
43
|
+
|
44
|
+
= Maintainers
|
45
|
+
|
46
|
+
- Peter Jones <mailto:pjones@pmade.com>
|
47
|
+
- James Healy <mailto:jimmy@deefa.com>
|
48
|
+
|
49
|
+
= Examples
|
50
|
+
|
51
|
+
The easiest way to explain how this works in practice is to show some examples.
|
52
|
+
|
53
|
+
== Page Counter
|
54
|
+
|
55
|
+
A simple app to count the number of pages in a PDF File.
|
56
|
+
|
57
|
+
require 'rubygems'
|
58
|
+
require 'pdf/reader'
|
59
|
+
|
60
|
+
class PageReceiver
|
61
|
+
attr_accessor :page_count
|
62
|
+
|
63
|
+
def initialize
|
64
|
+
@page_count = 0
|
65
|
+
end
|
66
|
+
|
67
|
+
# Called when page parsing ends
|
68
|
+
def end_page
|
69
|
+
@page_count += 1
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
73
|
+
receiver = PageReceiver.new
|
74
|
+
pdf = PDF::Reader.file("somefile.pdf", receiver)
|
75
|
+
# page_count is tracked by the receiver, not by the value PDF::Reader.file returns
puts "#{receiver.page_count} pages"
|
76
|
+
|
77
|
+
== Basic RSpec of a generated PDF
|
78
|
+
|
79
|
+
require 'rubygems'
|
80
|
+
require 'pdf/reader'
|
81
|
+
require 'pdf/writer'
|
82
|
+
require 'spec'
|
83
|
+
|
84
|
+
class PageTextReceiver
|
85
|
+
attr_accessor :content
|
86
|
+
|
87
|
+
def initialize
|
88
|
+
@content = []
|
89
|
+
end
|
90
|
+
|
91
|
+
# Called when page parsing starts
|
92
|
+
def begin_page(arg = nil)
|
93
|
+
@content << ""
|
94
|
+
end
|
95
|
+
|
96
|
+
def show_text(string, *params)
|
97
|
+
@content.last << string.strip
|
98
|
+
end
|
99
|
+
|
100
|
+
# there are a few text callbacks, so make sure we process them all
|
101
|
+
alias :super_show_text :show_text
|
102
|
+
alias :move_to_next_line_and_show_text :show_text
|
103
|
+
alias :set_spacing_next_line_show_text :show_text
|
104
|
+
|
105
|
+
end
|
106
|
+
|
107
|
+
context "My generated PDF" do
|
108
|
+
specify "should have the correct text on 2 pages" do
|
109
|
+
|
110
|
+
# generate our PDF
|
111
|
+
pdf = PDF::Writer.new
|
112
|
+
pdf.text "Chunky", :font_size => 32, :justification => :center
|
113
|
+
pdf.start_new_page
|
114
|
+
pdf.text "Bacon", :font_size => 32, :justification => :center
|
115
|
+
pdf.save_as("chunkybacon.pdf")
|
116
|
+
|
117
|
+
# process the PDF
|
118
|
+
receiver = PageTextReceiver.new
|
119
|
+
PDF::Reader.file("chunkybacon.pdf", receiver)
|
120
|
+
|
121
|
+
# confirm the text appears on the correct pages
|
122
|
+
receiver.content.size.should eql(2)
|
123
|
+
receiver.content[0].should eql("Chunky")
|
124
|
+
receiver.content[1].should eql("Bacon")
|
125
|
+
end
|
126
|
+
end
|
127
|
+
|
128
|
+
== Extract ISBNs
|
129
|
+
|
130
|
+
Parse all text in the requested PDF file and print out any valid book ISBNs.
|
131
|
+
Requires the rbook-isbn gem.
|
132
|
+
|
133
|
+
require 'rubygems'
|
134
|
+
require 'pdf/reader'
|
135
|
+
require 'rbook/isbn'
|
136
|
+
|
137
|
+
class ISBNReceiver
|
138
|
+
|
139
|
+
# there are a few text callbacks, so make sure we process them all
|
140
|
+
def show_text(string, *params)
|
141
|
+
process_words(string.split(/\W+/))
|
142
|
+
end
|
143
|
+
|
144
|
+
def super_show_text(string, *params)
|
145
|
+
process_words(string.split(/\W+/))
|
146
|
+
end
|
147
|
+
|
148
|
+
def move_to_next_line_and_show_text (string)
|
149
|
+
process_words(string.split(/\W+/))
|
150
|
+
end
|
151
|
+
|
152
|
+
def set_spacing_next_line_show_text (aw, ac, string)
|
153
|
+
process_words(string.split(/\W+/))
|
154
|
+
end
|
155
|
+
|
156
|
+
private
|
157
|
+
|
158
|
+
# check if any items in the supplied array are a valid ISBN, and print any
|
159
|
+
# that are to console
|
160
|
+
def process_words(words)
|
161
|
+
words.each do |word|
|
162
|
+
word.strip!
|
163
|
+
puts "#{RBook::ISBN.convert_to_isbn13(word)}" if RBook::ISBN.valid_isbn?(word)
|
164
|
+
end
|
165
|
+
end
|
166
|
+
end
|
167
|
+
|
168
|
+
receiver = ISBNReceiver.new
|
169
|
+
PDF::Reader.file("somefile.pdf", receiver)
|
170
|
+
|
171
|
+
|
172
|
+
= Resources
|
173
|
+
|
174
|
+
- PDF::Reader Homepage: http://software.pmade.com/pdfreader
|
175
|
+
- PDF::Reader Rubyforge Page: http://rubyforge.org/projects/pdf-reader/
|
176
|
+
- PDF Specification: http://www.adobe.com/devnet/pdf/pdf_reference.html
|
177
|
+
- PDF Tutorial Slide Presentations: http://home.comcast.net/~jk05/presentations/PDFTutorials.html
|
data/Rakefile
ADDED
@@ -0,0 +1,84 @@
|
|
1
|
+
require "rubygems"
|
2
|
+
require 'rake'
|
3
|
+
require 'rake/clean'
|
4
|
+
require 'rake/rdoctask'
|
5
|
+
require 'rake/testtask'
|
6
|
+
require "rake/gempackagetask"
|
7
|
+
require 'spec/rake/spectask'
|
8
|
+
|
9
|
+
PKG_VERSION = "0.5"
|
10
|
+
PKG_NAME = "pdf-reader"
|
11
|
+
PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"
|
12
|
+
|
13
|
+
desc "Default Task"
|
14
|
+
task :default => [ :spec ]
|
15
|
+
|
16
|
+
# run all rspecs
|
17
|
+
desc "Run all rspec files"
|
18
|
+
Spec::Rake::SpecTask.new("spec") do |t|
|
19
|
+
t.spec_files = FileList['specs/**/*.rb']
|
20
|
+
t.rcov = true
|
21
|
+
t.rcov_dir = (ENV['CC_BUILD_ARTIFACTS'] || 'doc') + "/rcov"
|
22
|
+
# t.rcov_opts = ["--exclude","spec.*\.rb"]
|
23
|
+
end
|
24
|
+
|
25
|
+
# generate specdocs
|
26
|
+
desc "Generate Specdocs"
|
27
|
+
Spec::Rake::SpecTask.new("specdocs") do |t|
|
28
|
+
t.spec_files = FileList['specs/**/*.rb']
|
29
|
+
t.spec_opts = ["--format", "rdoc"]
|
30
|
+
t.out = (ENV['CC_BUILD_ARTIFACTS'] || 'doc') + '/specdoc.rd'
|
31
|
+
end
|
32
|
+
|
33
|
+
# generate failing spec report
|
34
|
+
desc "Generate failing spec report"
|
35
|
+
Spec::Rake::SpecTask.new("spec_report") do |t|
|
36
|
+
t.spec_files = FileList['specs/**/*.rb']
|
37
|
+
t.spec_opts = ["--format", "html", "--diff"]
|
38
|
+
t.out = (ENV['CC_BUILD_ARTIFACTS'] || 'doc') + '/spec_report.html'
|
39
|
+
t.fail_on_error = false
|
40
|
+
end
|
41
|
+
|
42
|
+
# Generate the RDoc documentation
|
43
|
+
desc "Create documentation"
|
44
|
+
Rake::RDocTask.new("doc") do |rdoc|
|
45
|
+
rdoc.title = "pdf-reader"
|
46
|
+
rdoc.rdoc_dir = (ENV['CC_BUILD_ARTIFACTS'] || 'doc') + '/rdoc'
|
47
|
+
rdoc.rdoc_files.include('README')
|
48
|
+
rdoc.rdoc_files.include('TODO')
|
49
|
+
rdoc.rdoc_files.include('CHANGELOG')
|
50
|
+
#rdoc.rdoc_files.include('COPYING')
|
51
|
+
#rdoc.rdoc_files.include('LICENSE')
|
52
|
+
rdoc.rdoc_files.include('lib/**/*.rb')
|
53
|
+
rdoc.options << "--inline-source"
|
54
|
+
end
|
55
|
+
|
56
|
+
# a gemspec for packaging this library
|
57
|
+
# RSpec files aren't included, as they depend on the PDF files,
|
58
|
+
# which will make the gem filesize irritatingly large
|
59
|
+
spec = Gem::Specification.new do |spec|
|
60
|
+
spec.name = PKG_NAME
|
61
|
+
spec.version = PKG_VERSION
|
62
|
+
spec.platform = Gem::Platform::RUBY
|
63
|
+
spec.summary = "A library for accessing the content of PDF files"
|
64
|
+
spec.files = Dir.glob("{examples,lib}/**/**/*") +
|
65
|
+
["Rakefile"]
|
66
|
+
|
67
|
+
spec.require_path = "lib"
|
68
|
+
spec.has_rdoc = true
|
69
|
+
spec.extra_rdoc_files = %w{README TODO CHANGELOG}
|
70
|
+
spec.rdoc_options << '--title' << 'PDF::Reader Documentation' <<
|
71
|
+
'--main' << 'README' << '-q'
|
72
|
+
spec.author = "Peter Jones"
|
73
|
+
spec.email = "pjones@pmade.com"
|
74
|
+
spec.rubyforge_project = "pdf-reader"
|
75
|
+
spec.homepage = "http://software.pmade.com/pdfreader"
|
76
|
+
spec.description = "The PDF::Reader library implements a PDF parser conforming as much as possible to the PDF specification from Adobe"
|
77
|
+
end
|
78
|
+
|
79
|
+
# package the library into a gem
|
80
|
+
desc "Generate a gem for pdf-reader"
|
81
|
+
Rake::GemPackageTask.new(spec) do |pkg|
|
82
|
+
pkg.need_zip = true
|
83
|
+
pkg.need_tar = true
|
84
|
+
end
|
data/TODO
ADDED
@@ -0,0 +1,9 @@
|
|
1
|
+
Some ideas for future work
|
2
|
+
- Allows the user to only process certain aspects of the PDF file. For example, if they're only
|
3
|
+
interested in meta data, there's no point in walking the pages tree.
|
4
|
+
|
5
|
+
- Ship some extra receivers in the standard package, particularly ones that are useful for running
|
6
|
+
rspec over generated PDF files
|
7
|
+
|
8
|
+
- Improve metadata support
|
9
|
+
|
data/lib/pdf/reader.rb
ADDED
@@ -0,0 +1,106 @@
|
|
1
|
+
################################################################################
|
2
|
+
#
|
3
|
+
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
|
4
|
+
#
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
# a copy of this software and associated documentation files (the
|
7
|
+
# "Software"), to deal in the Software without restriction, including
|
8
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
# the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be
|
14
|
+
# included in all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
23
|
+
#
|
24
|
+
################################################################################
|
25
|
+
|
26
|
+
require 'stringio'
|
27
|
+
|
28
|
+
module PDF
|
29
|
+
################################################################################
|
30
|
+
# The Reader class serves as an entry point for parsing a PDF file. There are three
|
31
|
+
# ways to kick off processing - which one you pick will be based on personal preference
|
32
|
+
# and the situation.
|
33
|
+
#
|
34
|
+
# For all examples, assume the receiver variable contains an object that will respond
|
35
|
+
# to various callbacks. Refer to the README and PDF::Reader::Content for more information
|
36
|
+
# on receivers.
|
37
|
+
#
|
38
|
+
# = Parsing a file
|
39
|
+
#
|
40
|
+
# PDF::Reader.file("somefile.pdf", receiver)
|
41
|
+
#
|
42
|
+
# = Parsing a String
|
43
|
+
#
|
44
|
+
# This is useful for processing a PDF that is already in memory
|
45
|
+
#
|
46
|
+
# PDF::Reader.string(pdf_string, receiver)
|
47
|
+
#
|
48
|
+
# = Parsing an IO object
|
49
|
+
#
|
50
|
+
# This can be a useful alternative to the first 2 options in some situations
|
51
|
+
#
|
52
|
+
# pdf = PDF::Reader.new
|
53
|
+
# pdf.parse(File.new("somefile.pdf"), receiver)
|
54
|
+
class Reader
|
55
|
+
################################################################################
|
56
|
+
# Parse the file with the given name, sending events to the given receiver.
|
57
|
+
def self.file(name, receiver)
  # PDF data is binary, so open the file in binary mode ("rb"); text mode may
  # apply newline translation or transcoding and alter the bytes being parsed.
  # The block form closes the file once parsing has finished, and the value
  # returned by #parse is passed back to the caller.
  File.open(name, "rb") do |f|
    new.parse(f, receiver)
  end
end
|
62
|
+
################################################################################
|
63
|
+
# Parse the given string, sending events to the given receiver.
|
64
|
+
def self.string (str, receiver)
|
65
|
+
StringIO.open(str) do |s|
|
66
|
+
new.parse(s, receiver)
|
67
|
+
end
|
68
|
+
end
|
69
|
+
################################################################################
|
70
|
+
end
|
71
|
+
################################################################################
|
72
|
+
end
|
73
|
+
################################################################################
|
74
|
+
require 'pdf/reader/explore'
|
75
|
+
require 'pdf/reader/buffer'
|
76
|
+
require 'pdf/reader/content'
|
77
|
+
require 'pdf/reader/error'
|
78
|
+
require 'pdf/reader/filter'
|
79
|
+
require 'pdf/reader/name'
|
80
|
+
require 'pdf/reader/parser'
|
81
|
+
require 'pdf/reader/reference'
|
82
|
+
require 'pdf/reader/register_receiver'
|
83
|
+
require 'pdf/reader/text_receiver'
|
84
|
+
require 'pdf/reader/token'
|
85
|
+
require 'pdf/reader/xref'
|
86
|
+
|
87
|
+
|
88
|
+
class PDF::Reader
|
89
|
+
################################################################################
|
90
|
+
# Initialize a new PDF::Reader
|
91
|
+
def initialize
|
92
|
+
end
|
93
|
+
################################################################################
|
94
|
+
# Given an IO object that contains PDF data, parse it.
|
95
|
+
def parse (io, receiver)
|
96
|
+
@buffer = Buffer.new(io)
|
97
|
+
@xref = XRef.new(@buffer)
|
98
|
+
@parser = Parser.new(@buffer, @xref)
|
99
|
+
@content = (receiver == Explore ? Explore : Content).new(receiver, @xref)
|
100
|
+
|
101
|
+
trailer = @xref.load
|
102
|
+
@content.document(@xref.object(trailer['Root'])) || self
|
103
|
+
end
|
104
|
+
################################################################################
|
105
|
+
end
|
106
|
+
################################################################################
|
@@ -0,0 +1,144 @@
|
|
1
|
+
################################################################################
|
2
|
+
#
|
3
|
+
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
|
4
|
+
#
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
# a copy of this software and associated documentation files (the
|
7
|
+
# "Software"), to deal in the Software without restriction, including
|
8
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
# the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be
|
14
|
+
# included in all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
23
|
+
#
|
24
|
+
################################################################################
|
25
|
+
|
26
|
+
class PDF::Reader
|
27
|
+
################################################################################
|
28
|
+
# An internal PDF::Reader class that mediates access to the underlying PDF File or IO Stream
|
29
|
+
class Buffer
|
30
|
+
################################################################################
|
31
|
+
# Creates a new buffer around the specified IO object
|
32
|
+
def initialize (io)
|
33
|
+
@io = io
|
34
|
+
@buffer = nil
|
35
|
+
end
|
36
|
+
################################################################################
|
37
|
+
# Seek to the requested byte in the IO stream.
|
38
|
+
def seek (offset)
|
39
|
+
@io.seek(offset, IO::SEEK_SET)
|
40
|
+
@buffer = nil
|
41
|
+
self
|
42
|
+
end
|
43
|
+
################################################################################
|
44
|
+
# reads the requested number of bytes from the underlying IO stream.
|
45
|
+
#
|
46
|
+
# length should be a positive integer.
|
47
|
+
def read(length)
  out = ""

  # Bytes already pulled into the internal line buffer are consumed first.
  if @buffer && !@buffer.empty?
    out << head(length)
    length -= out.length
  end

  # Whatever is still missing comes straight from the IO object. IO#read(n)
  # returns nil at end of stream, so only append when data actually came back;
  # a read at EOF then yields a short (possibly empty) string instead of
  # raising TypeError.
  if length > 0
    chunk = @io.read(length)
    out << chunk if chunk
  end

  out
end
|
58
|
+
################################################################################
|
59
|
+
# returns true if the underlying IO object is at end and the internal buffer
|
60
|
+
# is empty
|
61
|
+
def eof?
|
62
|
+
if @buffer
|
63
|
+
@buffer.empty? && @io.eof?
|
64
|
+
else
|
65
|
+
@io.eof?
|
66
|
+
end
|
67
|
+
end
|
68
|
+
################################################################################
|
69
|
+
def pos
|
70
|
+
@io.pos
|
71
|
+
end
|
72
|
+
################################################################################
|
73
|
+
# PDF files are processed by tokenising the content into a series of objects and commands.
|
74
|
+
# This prepares the buffer for use by reading the next line of tokens into memory.
|
75
|
+
def ready_token (with_strip=true, skip_blanks=true)
|
76
|
+
while @buffer.nil? or @buffer.empty?
|
77
|
+
@buffer = @io.readline
|
78
|
+
@buffer.sub!(/%.*$/, '')
|
79
|
+
@buffer.chomp!
|
80
|
+
@buffer.lstrip! if with_strip
|
81
|
+
break unless skip_blanks
|
82
|
+
end
|
83
|
+
end
|
84
|
+
################################################################################
|
85
|
+
# return the next token from the underlying IO stream
|
86
|
+
def token
  # make sure the internal buffer holds something to tokenise
  ready_token

  # Position of the first delimiter or whitespace character. When there is
  # none, the whole buffer is a single token.
  i = @buffer.index(/[\[\]()<>{}\s\/]/) || @buffer.size

  # Work out how many characters make up the next token. Written as a plain
  # multi-line if/elsif because the "if cond : value" form only parses on
  # Ruby 1.8.
  token_chars =
    if i == 0 && @buffer[i, 2] == "<<"
      2
    elsif i == 0 && @buffer[i, 2] == ">>"
      2
    elsif i == 0
      1 # any other delimiter is a one-character token
    else
      i # a regular token runs up to the next delimiter
    end

  # Leading whitespace is left in the buffer after an opening "(" --
  # presumably because it is part of the string literal that follows.
  strip_space = !(i == 0 && @buffer[0, 1] == '(')
  head(token_chars, strip_space)
end
|
101
|
+
################################################################################
|
102
|
+
def head (chars, with_strip=true)
|
103
|
+
val = @buffer[0, chars]
|
104
|
+
@buffer = @buffer[chars .. -1] || ""
|
105
|
+
@buffer.lstrip! if with_strip
|
106
|
+
val
|
107
|
+
end
|
108
|
+
################################################################################
|
109
|
+
# return the internal buffer used by this class when reading from the IO stream.
|
110
|
+
def raw
|
111
|
+
@buffer
|
112
|
+
end
|
113
|
+
################################################################################
|
114
|
+
# The Xref table in a PDF file acts as an aid for finding the location of various
|
115
|
+
# objects in the file. This method attempts to locate the byte offset of the xref
|
116
|
+
# table in the underlying IO stream.
|
117
|
+
def find_first_xref_offset
  # Only the last 1024 bytes are inspected. If the relative seek fails
  # (e.g. the stream is shorter than that), fall back to the start.
  @io.seek(-1024, IO::SEEK_END) rescue seek(0)

  # IO#read returns nil on an empty stream; treat that as "no data" so the
  # MalformedPDFError below is raised instead of a NoMethodError on nil.
  data = @io.read(1024) || ""

  # the PDF 1.7 spec (section #3.4) says that EOL markers can be either \r, \n, or both.
  # To ensure we find the xref offset correctly, change all possible options to a
  # standard format
  data = data.gsub("\r\n","\n").gsub("\n\r","\n").gsub("\r","\n")

  # Walk the lines from the end of the data backwards, so the first match is
  # the last %%EOF marker in the data.
  lines = data.split(/\n/).reverse

  eof_index = nil

  lines.each_with_index do |line, index|
    if line =~ /^%%EOF\r?$/
      eof_index = index
      break
    end
  end

  raise MalformedPDFError, "PDF does not contain EOF marker" if eof_index.nil?
  raise MalformedPDFError, "PDF EOF marker does not follow offset" if eof_index >= lines.size-1

  # In the reversed list, the entry after the marker is the line that
  # precedes %%EOF in the file; its integer value is returned as the offset.
  lines[eof_index+1].to_i
end
|
140
|
+
################################################################################
|
141
|
+
end
|
142
|
+
################################################################################
|
143
|
+
end
|
144
|
+
################################################################################
|