pdf-reader 0.5
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +2 -0
- data/README +177 -0
- data/Rakefile +84 -0
- data/TODO +9 -0
- data/lib/pdf/reader.rb +106 -0
- data/lib/pdf/reader/buffer.rb +144 -0
- data/lib/pdf/reader/content.rb +289 -0
- data/lib/pdf/reader/error.rb +53 -0
- data/lib/pdf/reader/explore.rb +116 -0
- data/lib/pdf/reader/filter.rb +62 -0
- data/lib/pdf/reader/name.rb +37 -0
- data/lib/pdf/reader/parser.rb +203 -0
- data/lib/pdf/reader/reference.rb +55 -0
- data/lib/pdf/reader/register_receiver.rb +18 -0
- data/lib/pdf/reader/text_receiver.rb +259 -0
- data/lib/pdf/reader/token.rb +41 -0
- data/lib/pdf/reader/xref.rb +101 -0
- metadata +70 -0
data/CHANGELOG
ADDED
data/README
ADDED
@@ -0,0 +1,177 @@
|
|
1
|
+
The PDF::Reader library implements a PDF parser conforming as much as possible
|
2
|
+
to the PDF specification from Adobe.
|
3
|
+
|
4
|
+
It provides programmatic access to the contents of a PDF file with a high
|
5
|
+
degree of flexibility.
|
6
|
+
|
7
|
+
The PDF 1.7 specification is a weighty document and not all aspects are
|
8
|
+
currently supported. We welcome submission of PDF files that exhibit
|
9
|
+
unsupported aspects of the spec to assist with improving our support.
|
10
|
+
|
11
|
+
= Installation
|
12
|
+
|
13
|
+
The recommended installation method is via Rubygems.
|
14
|
+
|
15
|
+
gem install pdf-reader
|
16
|
+
|
17
|
+
= Usage
|
18
|
+
|
19
|
+
PDF::Reader is designed with a callback-style architecture. The basic concept
|
20
|
+
is to build a receiver class and pass that into PDF::Reader along with the PDF
|
21
|
+
to process.
|
22
|
+
|
23
|
+
As PDF::Reader walks the file and encounters various objects (pages, text,
|
24
|
+
images, shapes, etc) it will call methods on the receiver class. What those
|
25
|
+
methods do is entirely up to you - save the text, extract images, count pages,
|
26
|
+
read metadata, whatever.
|
27
|
+
|
28
|
+
For a full list of the supported callback methods and a description of when they
|
29
|
+
will be called, refer to PDF::Reader::Content.
|
30
|
+
|
31
|
+
= Exceptions
|
32
|
+
|
33
|
+
There are two key exceptions that you will need to watch out for when processing a
|
34
|
+
PDF file:
|
35
|
+
|
36
|
+
MalformedPDFError - The PDF appears to be corrupt in some way. If you believe the
|
37
|
+
file should be valid, or that a corrupt file didn't raise an exception, please
|
38
|
+
forward a copy of the file to the maintainers and we can attempt to improve the code.
|
39
|
+
|
40
|
+
UnsupportedFeatureError - The PDF uses a feature that PDF::Reader doesn't currently
|
41
|
+
support. Again, we welcome submissions of PDF files that exhibit these features to help
|
42
|
+
us with future code improvements.
|
43
|
+
|
44
|
+
= Maintainers
|
45
|
+
|
46
|
+
- Peter Jones <mailto:pjones@pmade.com>
|
47
|
+
- James Healy <mailto:jimmy@deefa.com>
|
48
|
+
|
49
|
+
= Examples
|
50
|
+
|
51
|
+
The easiest way to explain how this works in practice is to show some examples.
|
52
|
+
|
53
|
+
== Page Counter
|
54
|
+
|
55
|
+
A simple app to count the number of pages in a PDF File.
|
56
|
+
|
57
|
+
require 'rubygems'
|
58
|
+
require 'pdf/reader'
|
59
|
+
|
60
|
+
class PageReceiver
|
61
|
+
attr_accessor :page_count
|
62
|
+
|
63
|
+
def initialize
|
64
|
+
@page_count = 0
|
65
|
+
end
|
66
|
+
|
67
|
+
# Called when page parsing ends
|
68
|
+
def end_page
|
69
|
+
@page_count += 1
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
73
|
+
receiver = PageReceiver.new
|
74
|
+
pdf = PDF::Reader.file("somefile.pdf", receiver)
|
75
|
+
puts "#{receiver.page_count} pages"
|
76
|
+
|
77
|
+
== Basic RSpec of a generated PDF
|
78
|
+
|
79
|
+
require 'rubygems'
|
80
|
+
require 'pdf/reader'
|
81
|
+
require 'pdf/writer'
|
82
|
+
require 'spec'
|
83
|
+
|
84
|
+
class PageTextReceiver
|
85
|
+
attr_accessor :content
|
86
|
+
|
87
|
+
def initialize
|
88
|
+
@content = []
|
89
|
+
end
|
90
|
+
|
91
|
+
# Called when page parsing starts
|
92
|
+
def begin_page(arg = nil)
|
93
|
+
@content << ""
|
94
|
+
end
|
95
|
+
|
96
|
+
def show_text(string, *params)
|
97
|
+
@content.last << string.strip
|
98
|
+
end
|
99
|
+
|
100
|
+
# there's a few text callbacks, so make sure we process them all
|
101
|
+
alias :super_show_text :show_text
|
102
|
+
alias :move_to_next_line_and_show_text :show_text
|
103
|
+
alias :set_spacing_next_line_show_text :show_text
|
104
|
+
|
105
|
+
end
|
106
|
+
|
107
|
+
context "My generated PDF" do
|
108
|
+
specify "should have the correct text on 2 pages" do
|
109
|
+
|
110
|
+
# generate our PDF
|
111
|
+
pdf = PDF::Writer.new
|
112
|
+
pdf.text "Chunky", :font_size => 32, :justification => :center
|
113
|
+
pdf.start_new_page
|
114
|
+
pdf.text "Bacon", :font_size => 32, :justification => :center
|
115
|
+
pdf.save_as("chunkybacon.pdf")
|
116
|
+
|
117
|
+
# process the PDF
|
118
|
+
receiver = PageTextReceiver.new
|
119
|
+
PDF::Reader.file("chunkybacon.pdf", receiver)
|
120
|
+
|
121
|
+
# confirm the text appears on the correct pages
|
122
|
+
receiver.content.size.should eql(2)
|
123
|
+
receiver.content[0].should eql("Chunky")
|
124
|
+
receiver.content[1].should eql("Bacon")
|
125
|
+
end
|
126
|
+
end
|
127
|
+
|
128
|
+
== Extract ISBNs
|
129
|
+
|
130
|
+
Parse all text in the requested PDF file and print out any valid book ISBNs.
|
131
|
+
Requires the rbook-isbn gem.
|
132
|
+
|
133
|
+
require 'rubygems'
|
134
|
+
require 'pdf/reader'
|
135
|
+
require 'rbook/isbn'
|
136
|
+
|
137
|
+
class ISBNReceiver
|
138
|
+
|
139
|
+
# there's a few text callbacks, so make sure we process them all
|
140
|
+
def show_text(string, *params)
|
141
|
+
process_words(string.split(/\W+/))
|
142
|
+
end
|
143
|
+
|
144
|
+
def super_show_text(string, *params)
|
145
|
+
process_words(string.split(/\W+/))
|
146
|
+
end
|
147
|
+
|
148
|
+
def move_to_next_line_and_show_text (string)
|
149
|
+
process_words(string.split(/\W+/))
|
150
|
+
end
|
151
|
+
|
152
|
+
def set_spacing_next_line_show_text (aw, ac, string)
|
153
|
+
process_words(string.split(/\W+/))
|
154
|
+
end
|
155
|
+
|
156
|
+
private
|
157
|
+
|
158
|
+
# check if any items in the supplied array are a valid ISBN, and print any
|
159
|
+
# that are to console
|
160
|
+
def process_words(words)
|
161
|
+
words.each do |word|
|
162
|
+
word.strip!
|
163
|
+
puts "#{RBook::ISBN.convert_to_isbn13(word)}" if RBook::ISBN.valid_isbn?(word)
|
164
|
+
end
|
165
|
+
end
|
166
|
+
end
|
167
|
+
|
168
|
+
receiver = ISBNReceiver.new
|
169
|
+
PDF::Reader.file("somefile.pdf", receiver)
|
170
|
+
|
171
|
+
|
172
|
+
= Resources
|
173
|
+
|
174
|
+
- PDF::Reader Homepage: http://software.pmade.com/pdfreader
|
175
|
+
- PDF::Reader Rubyforge Page: http://rubyforge.org/projects/pdf-reader/
|
176
|
+
- PDF Specification: http://www.adobe.com/devnet/pdf/pdf_reference.html
|
177
|
+
- PDF Tutorial Slide Presentations: http://home.comcast.net/~jk05/presentations/PDFTutorials.html
|
data/Rakefile
ADDED
@@ -0,0 +1,84 @@
|
|
1
|
+
require "rubygems"
|
2
|
+
require 'rake'
|
3
|
+
require 'rake/clean'
|
4
|
+
require 'rake/rdoctask'
|
5
|
+
require 'rake/testtask'
|
6
|
+
require "rake/gempackagetask"
|
7
|
+
require 'spec/rake/spectask'
|
8
|
+
|
9
|
+
PKG_VERSION   = "0.5"
PKG_NAME      = "pdf-reader"
PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"

# Directory that receives generated reports and docs. A CI server can
# redirect it through the CC_BUILD_ARTIFACTS environment variable.
artifacts_dir = ENV['CC_BUILD_ARTIFACTS'] || 'doc'

desc "Default Task"
task :default => [ :spec ]

# Run every spec, collecting rcov coverage output.
desc "Run all rspec files"
Spec::Rake::SpecTask.new("spec") do |spec_task|
  spec_task.spec_files = FileList['specs/**/*.rb']
  spec_task.rcov       = true
  spec_task.rcov_dir   = "#{artifacts_dir}/rcov"
  # spec_task.rcov_opts = ["--exclude","spec.*\.rb"]
end

# Render the specs as rdoc-formatted documentation.
desc "Generate Specdocs"
Spec::Rake::SpecTask.new("specdocs") do |spec_task|
  spec_task.spec_files = FileList['specs/**/*.rb']
  spec_task.spec_opts  = ["--format", "rdoc"]
  spec_task.out        = "#{artifacts_dir}/specdoc.rd"
end

# Write an HTML spec report; failing specs do not fail the task.
desc "Generate failing spec report"
Spec::Rake::SpecTask.new("spec_report") do |spec_task|
  spec_task.spec_files    = FileList['specs/**/*.rb']
  spec_task.spec_opts     = ["--format", "html", "--diff"]
  spec_task.out           = "#{artifacts_dir}/spec_report.html"
  spec_task.fail_on_error = false
end

# Generate the RDoc documentation
desc "Create documentation"
Rake::RDocTask.new("doc") do |rdoc|
  rdoc.title    = "pdf-reader"
  rdoc.rdoc_dir = "#{artifacts_dir}/rdoc"
  %w{README TODO CHANGELOG}.each { |doc_file| rdoc.rdoc_files.include(doc_file) }
  #rdoc.rdoc_files.include('COPYING')
  #rdoc.rdoc_files.include('LICENSE')
  rdoc.rdoc_files.include('lib/**/*.rb')
  rdoc.options << "--inline-source"
end

# The gemspec used to package this library.
# RSpec files aren't included, as they depend on the PDF files,
# which will make the gem filesize irritatingly large
spec = Gem::Specification.new do |gem|
  gem.name              = PKG_NAME
  gem.version           = PKG_VERSION
  gem.platform          = Gem::Platform::RUBY
  gem.summary           = "A library for accessing the content of PDF files"
  gem.files             = Dir.glob("{examples,lib}/**/**/*") + ["Rakefile"]

  gem.require_path      = "lib"
  gem.has_rdoc          = true
  gem.extra_rdoc_files  = %w{README TODO CHANGELOG}
  gem.rdoc_options << '--title' << 'PDF::Reader Documentation'
  gem.rdoc_options << '--main' << 'README' << '-q'
  gem.author            = "Peter Jones"
  gem.email             = "pjones@pmade.com"
  gem.rubyforge_project = "pdf-reader"
  gem.homepage          = "http://software.pmade.com/pdfreader"
  gem.description       = "The PDF::Reader library implements a PDF parser conforming as much as possible to the PDF specification from Adobe"
end

# Package the library into a gem, plus zip and tar archives.
desc "Generate a gem for pdf-reader"
Rake::GemPackageTask.new(spec) do |pkg|
  pkg.need_zip = true
  pkg.need_tar = true
end
|
data/TODO
ADDED
@@ -0,0 +1,9 @@
|
|
1
|
+
Some ideas for future work
|
2
|
+
- Allow the user to only process certain aspects of the PDF file. For example, if they're only
|
3
|
+
interested in meta data, there's no point in walking the pages tree.
|
4
|
+
|
5
|
+
- Ship some extra receivers in the standard package, particularly ones that are useful for running
|
6
|
+
rspec over generated PDF files
|
7
|
+
|
8
|
+
- Improve metadata support
|
9
|
+
|
data/lib/pdf/reader.rb
ADDED
@@ -0,0 +1,106 @@
|
|
1
|
+
################################################################################
|
2
|
+
#
|
3
|
+
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
|
4
|
+
#
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
# a copy of this software and associated documentation files (the
|
7
|
+
# "Software"), to deal in the Software without restriction, including
|
8
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
# the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be
|
14
|
+
# included in all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
23
|
+
#
|
24
|
+
################################################################################
|
25
|
+
|
26
|
+
require 'stringio'
|
27
|
+
|
28
|
+
module PDF
  ################################################################################
  # The Reader class serves as an entry point for parsing a PDF file. There are three
  # ways to kick off processing - which one you pick will be based on personal preference
  # and the situation.
  #
  # For all examples, assume the receiver variable contains an object that will respond
  # to various callbacks. Refer to the README and PDF::Reader::Content for more information
  # on receivers.
  #
  # = Parsing a file
  #
  #   PDF::Reader.file("somefile.pdf", receiver)
  #
  # = Parsing a String
  #
  # This is useful for processing a PDF that is already in memory. The string
  # holds the PDF data itself, not a filename.
  #
  #   PDF::Reader.string(pdf_data, receiver)
  #
  # = Parsing an IO object
  #
  # This can be a useful alternative to the first 2 options in some situations
  #
  #   pdf = PDF::Reader.new
  #   pdf.parse(File.new("somefile.pdf"), receiver)
  class Reader
    ################################################################################
    # Parse the file with the given name, sending events to the given receiver.
    #
    # The file is opened in binary mode ("rb"): PDF is a binary format addressed
    # by byte offsets, so no newline translation may be applied while reading.
    # The file is closed when parsing finishes; the result of #parse is returned.
    def self.file(name, receiver)
      File.open(name, "rb") do |f|
        new.parse(f, receiver)
      end
    end
    ################################################################################
    # Parse the given string, sending events to the given receiver.
    #
    # str is wrapped in a StringIO and handed to #parse; the result of #parse is
    # returned.
    def self.string(str, receiver)
      StringIO.open(str) do |s|
        new.parse(s, receiver)
      end
    end
    ################################################################################
  end
  ################################################################################
end
|
73
|
+
################################################################################
|
74
|
+
require 'pdf/reader/explore'
|
75
|
+
require 'pdf/reader/buffer'
|
76
|
+
require 'pdf/reader/content'
|
77
|
+
require 'pdf/reader/error'
|
78
|
+
require 'pdf/reader/filter'
|
79
|
+
require 'pdf/reader/name'
|
80
|
+
require 'pdf/reader/parser'
|
81
|
+
require 'pdf/reader/reference'
|
82
|
+
require 'pdf/reader/register_receiver'
|
83
|
+
require 'pdf/reader/text_receiver'
|
84
|
+
require 'pdf/reader/token'
|
85
|
+
require 'pdf/reader/xref'
|
86
|
+
|
87
|
+
|
88
|
+
class PDF::Reader
  ################################################################################
  # Build a new PDF::Reader. No state is set up until #parse is called.
  def initialize
  end
  ################################################################################
  # Parse PDF data from the given IO object, sending events to receiver.
  #
  # A Buffer is wrapped around io, and an XRef and a Parser are built on top of
  # that buffer. When receiver is the Explore class itself, an Explore instance
  # handles the document; otherwise a Content instance does.
  #
  # Returns the result of the handler's document call, or this reader when that
  # result is nil or false.
  def parse(io, receiver)
    @buffer = Buffer.new(io)
    @xref   = XRef.new(@buffer)
    @parser = Parser.new(@buffer, @xref)

    handler_class = receiver == Explore ? Explore : Content
    @content = handler_class.new(receiver, @xref)

    # XRef#load yields the trailer; its 'Root' entry names the object that is
    # handed to the content handler.
    trailer = @xref.load
    root    = @xref.object(trailer['Root'])
    @content.document(root) || self
  end
  ################################################################################
end
|
106
|
+
################################################################################
|
@@ -0,0 +1,144 @@
|
|
1
|
+
################################################################################
|
2
|
+
#
|
3
|
+
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
|
4
|
+
#
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
# a copy of this software and associated documentation files (the
|
7
|
+
# "Software"), to deal in the Software without restriction, including
|
8
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
# the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be
|
14
|
+
# included in all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
23
|
+
#
|
24
|
+
################################################################################
|
25
|
+
|
26
|
+
class PDF::Reader
|
27
|
+
################################################################################
|
28
|
+
# An internal PDF::Reader class that mediates access to the underlying PDF File or IO Stream
|
29
|
+
class Buffer
  ################################################################################
  # Creates a new buffer around the specified IO object.
  #
  # io must support seek, read, readline, eof? and pos.
  def initialize(io)
    @io = io
    @buffer = nil
  end
  ################################################################################
  # Seek to the requested byte in the IO stream.
  #
  # Any line currently held in the internal buffer is discarded. Returns self.
  def seek(offset)
    @io.seek(offset, IO::SEEK_SET)
    @buffer = nil
    self
  end
  ################################################################################
  # Reads the requested number of bytes from the underlying IO stream.
  #
  # length should be a positive integer.
  #
  # Data left in the internal line buffer is consumed first and the remainder is
  # read from the IO. Fewer bytes than requested (possibly an empty string) are
  # returned when the IO runs out of data.
  def read(length)
    out = ""

    if @buffer && !@buffer.empty?
      out << head(length)
      length -= out.length
    end

    # IO#read(n) returns nil at end of stream; treat that as "no more data".
    out << (@io.read(length) || "") if length > 0
    out
  end
  ################################################################################
  # Returns true if the underlying IO object is at end and the internal buffer
  # is empty.
  def eof?
    if @buffer
      @buffer.empty? && @io.eof?
    else
      @io.eof?
    end
  end
  ################################################################################
  # Returns the current position of the underlying IO object. Data already held
  # in the internal buffer is not taken into account.
  def pos
    @io.pos
  end
  ################################################################################
  # PDF files are processed by tokenising the content into a series of objects and commands.
  # This prepares the buffer for use by reading the next line of tokens into memory.
  #
  # Nothing is read while the buffer still holds data. Each line read has its
  # '%' comment and line ending removed; leading whitespace is removed when
  # with_strip is true. With skip_blanks true, reading continues until a
  # non-empty line is found; otherwise at most one line is read.
  def ready_token(with_strip=true, skip_blanks=true)
    while @buffer.nil? || @buffer.empty?
      @buffer = @io.readline
      @buffer.sub!(/%.*$/, '')
      @buffer.chomp!
      @buffer.lstrip! if with_strip
      break unless skip_blanks
    end
  end
  ################################################################################
  # Return the next token from the underlying IO stream.
  #
  # A token is either a run of characters up to the next delimiter or
  # whitespace, a single delimiter character, or one of the two-character
  # delimiters "<<" and ">>".
  def token
    ready_token

    i = @buffer.index(/[\[\]()<>{}\s\/]/) || @buffer.size

    # "then" replaces the Ruby 1.8-only "if cond : value" form, which is a
    # syntax error from Ruby 1.9 onwards.
    token_chars =
      if i == 0 && @buffer[i, 2] == "<<" then 2
      elsif i == 0 && @buffer[i, 2] == ">>" then 2
      elsif i == 0 then 1
      else i
      end

    # Keep the whitespace that follows an opening '(' in the buffer.
    strip_space = !(i == 0 && @buffer[0, 1] == '(')
    head(token_chars, strip_space)
  end
  ################################################################################
  # Removes and returns the first chars characters of the internal buffer.
  # Leading whitespace is stripped from what remains when with_strip is true.
  def head(chars, with_strip=true)
    val = @buffer[0, chars]
    @buffer = @buffer[chars .. -1] || ""
    @buffer.lstrip! if with_strip
    val
  end
  ################################################################################
  # Return the internal buffer used by this class when reading from the IO stream.
  def raw
    @buffer
  end
  ################################################################################
  # The Xref table in a PDF file acts as an aid for finding the location of various
  # objects in the file. This method attempts to locate the byte offset of the xref
  # table in the underlying IO stream.
  #
  # Up to the last 1024 bytes of the stream are searched for the final %%EOF
  # line; the line preceding it is returned as an integer.
  #
  # Raises MalformedPDFError when no %%EOF line is found (including for an empty
  # stream) or when no line precedes it in the data that was read.
  def find_first_xref_offset
    begin
      @io.seek(-1024, IO::SEEK_END)
    rescue StandardError
      # stream is shorter than 1024 bytes - scan it from the start instead
      seek(0)
    end
    # IO#read returns nil for an empty stream; fall through to the
    # "no EOF marker" error below instead of failing on nil.
    data = @io.read(1024) || ""

    # the PDF 1.7 spec (section #3.4) says that EOL markers can be either \r, \n, or both.
    # To ensure we find the xref offset correctly, change all possible options to a
    # standard format
    data = data.gsub("\r\n","\n").gsub("\n\r","\n").gsub("\r","\n")
    lines = data.split(/\n/).reverse

    eof_index = nil

    lines.each_with_index do |line, index|
      if line =~ /^%%EOF\r?$/
        eof_index = index
        break
      end
    end

    raise MalformedPDFError, "PDF does not contain EOF marker" if eof_index.nil?
    raise MalformedPDFError, "PDF EOF marker does not follow offset" if eof_index >= lines.size-1
    lines[eof_index+1].to_i
  end
  ################################################################################
end
|
142
|
+
################################################################################
|
143
|
+
end
|
144
|
+
################################################################################
|