fireinc-pdf-reader 0.11.0.alpha
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +168 -0
- data/MIT-LICENSE +21 -0
- data/README.rdoc +137 -0
- data/Rakefile +34 -0
- data/TODO +45 -0
- data/bin/pdf_list_callbacks +15 -0
- data/bin/pdf_object +48 -0
- data/bin/pdf_text +15 -0
- data/examples/callbacks.rb +21 -0
- data/examples/extract_bates.rb +49 -0
- data/examples/extract_images.rb +108 -0
- data/examples/hash.rb +12 -0
- data/examples/metadata.rb +25 -0
- data/examples/page_counter_improved.rb +23 -0
- data/examples/page_counter_naive.rb +24 -0
- data/examples/rspec.rb +57 -0
- data/examples/text.rb +40 -0
- data/examples/version.rb +25 -0
- data/lib/pdf/hash.rb +15 -0
- data/lib/pdf/reader/abstract_strategy.rb +81 -0
- data/lib/pdf/reader/buffer.rb +346 -0
- data/lib/pdf/reader/cmap.rb +138 -0
- data/lib/pdf/reader/encoding.rb +190 -0
- data/lib/pdf/reader/encodings/mac_expert.txt +159 -0
- data/lib/pdf/reader/encodings/mac_roman.txt +128 -0
- data/lib/pdf/reader/encodings/pdf_doc.txt +40 -0
- data/lib/pdf/reader/encodings/standard.txt +47 -0
- data/lib/pdf/reader/encodings/symbol.txt +154 -0
- data/lib/pdf/reader/encodings/win_ansi.txt +29 -0
- data/lib/pdf/reader/encodings/zapf_dingbats.txt +201 -0
- data/lib/pdf/reader/error.rb +53 -0
- data/lib/pdf/reader/filter.rb +219 -0
- data/lib/pdf/reader/font.rb +133 -0
- data/lib/pdf/reader/form_xobject.rb +83 -0
- data/lib/pdf/reader/glyphlist.txt +4322 -0
- data/lib/pdf/reader/lzw.rb +123 -0
- data/lib/pdf/reader/metadata_strategy.rb +56 -0
- data/lib/pdf/reader/object_cache.rb +85 -0
- data/lib/pdf/reader/object_hash.rb +289 -0
- data/lib/pdf/reader/object_stream.rb +51 -0
- data/lib/pdf/reader/page.rb +185 -0
- data/lib/pdf/reader/page_text_receiver.rb +278 -0
- data/lib/pdf/reader/pages_strategy.rb +475 -0
- data/lib/pdf/reader/parser.rb +225 -0
- data/lib/pdf/reader/print_receiver.rb +18 -0
- data/lib/pdf/reader/reference.rb +66 -0
- data/lib/pdf/reader/register_receiver.rb +95 -0
- data/lib/pdf/reader/stream.rb +69 -0
- data/lib/pdf/reader/text_receiver.rb +264 -0
- data/lib/pdf/reader/token.rb +41 -0
- data/lib/pdf/reader/xref.rb +220 -0
- data/lib/pdf/reader.rb +296 -0
- data/lib/pdf-reader.rb +1 -0
- metadata +211 -0
@@ -0,0 +1,220 @@
|
|
1
|
+
################################################################################
|
2
|
+
#
|
3
|
+
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
|
4
|
+
#
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
# a copy of this software and associated documentation files (the
|
7
|
+
# "Software"), to deal in the Software without restriction, including
|
8
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
# the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be
|
14
|
+
# included in all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
23
|
+
#
|
24
|
+
################################################################################
|
25
|
+
|
26
|
+
class PDF::Reader
|
27
|
+
################################################################################
|
28
|
+
# An internal PDF::Reader class that represents the XRef table in a PDF file as a
|
29
|
+
# hash-like object.
|
30
|
+
#
|
31
|
+
# An Xref table is a map of object identifiers and byte offsets. Any time a particular
|
32
|
+
# object needs to be found, the Xref table is used to find where it is stored in the
|
33
|
+
# file.
|
34
|
+
#
|
35
|
+
# Hash keys are object ids, values are either:
|
36
|
+
#
|
37
|
+
# * a byte offset where the object starts (regular PDF objects)
|
38
|
+
# * a PDF::Reader::Reference instance that points to a stream that contains the
|
39
|
+
# desired object (PDF objects embedded in an object stream)
|
40
|
+
#
|
41
|
+
# The class behaves much like a standard Ruby hash, including the use of
|
42
|
+
# the Enumerable mixin. The key difference is no []= method - the hash
|
43
|
+
# is read only.
|
44
|
+
#
|
45
|
+
class XRef
|
46
|
+
include Enumerable
|
47
|
+
attr_reader :trailer
|
48
|
+
|
49
|
+
################################################################################
|
50
|
+
# create a new Xref table based on the contents of the supplied io object
|
51
|
+
#
|
52
|
+
# io - must be an IO object, generally either a file or a StringIO
|
53
|
+
#
|
54
|
+
# Builds the table straight away: the cross reference data is read from +io+
# while the object is being constructed, and the value returned by
# load_offsets is kept so it can be read back through #trailer.
def initialize(io)
  @xref    = {}
  @io      = io
  @trailer = load_offsets
end
|
59
|
+
################################################################################
|
60
|
+
# return the number of objects in this file. Objects with multiple generations are
|
61
|
+
# only counted once.
|
62
|
+
# Returns an Integer: the number of distinct object ids held in the table.
# An id stored under several generations contributes one to the total.
def size
  @xref.length
end
|
65
|
+
################################################################################
|
66
|
+
# returns the byte offset for the specified PDF object.
|
67
|
+
#
|
68
|
+
# ref - a PDF::Reader::Reference object containing an object ID and revision number
|
69
|
+
# Looks up the value stored for ref.id / ref.gen.
#
# Returns whatever was stored for that id and generation, or nil when the id
# is known but the generation is not.
#
# Raises InvalidObjectError when the lookup itself fails (for example the id
# is not in the table at all).
def [](ref)
  generations = @xref[ref.id]
  generations[ref.gen]
rescue StandardError
  raise InvalidObjectError, "Object #{ref.id}, Generation #{ref.gen} is invalid"
end
|
74
|
+
################################################################################
|
75
|
+
# iterate over each object in the xref table
|
76
|
+
# Yields a PDF::Reader::Reference for every object id in the table, in
# ascending id order. Only the highest generation stored for an id is
# yielded.
#
# Returns an Enumerator when no block is given, so the method can be chained
# like the iterators on Ruby's own collections.
def each(&block)
  return enum_for(:each) unless block

  @xref.keys.sort.each do |id|
    # max picks the newest generation without sorting the whole key list
    yield PDF::Reader::Reference.new(id, @xref[id].keys.max)
  end
end
|
83
|
+
################################################################################
|
84
|
+
private
|
85
|
+
################################################################################
|
86
|
+
# Read a xref table from the underlying buffer.
|
87
|
+
#
|
88
|
+
# If offset is specified the table will be loaded from there, otherwise the
|
89
|
+
# default offset will be located and used.
|
90
|
+
#
|
91
|
+
# After seeking to the offset, processing is handed off to either load_xref_table()
|
92
|
+
# or load_xref_stream() based on what we find there.
|
93
|
+
#
|
94
|
+
# Loads the cross reference data found at +offset+ and returns the trailer
# produced by load_xref_table or load_xref_stream.
#
# offset - where to start reading; when nil the offset reported by the
#          buffer's find_first_xref_offset is used
#
# Raises PDF::Reader::MalformedPDFError when the tokens at the offset are
# neither an xref keyword nor the "<id> <gen> obj" header of an object.
def load_offsets(offset = nil)
  offset ||= new_buffer.find_first_xref_offset

  buf   = new_buffer(offset)
  first = buf.token

  # the keyword means a classic table follows
  return load_xref_table(buf) if first == "xref" || first == "ref"

  second = buf.token
  third  = buf.token

  unless first.to_i >= 0 && second.to_i >= 0 && third == "obj"
    raise PDF::Reader::MalformedPDFError, "xref table not found at offset #{offset} (#{first} != xref)"
  end

  # start over at the same offset so the parser reads the complete object
  parser = PDF::Reader::Parser.new(new_buffer(offset))
  load_xref_stream(parser.object(first.to_i, second.to_i))
end
|
113
|
+
################################################################################
|
114
|
+
# Assumes the underlying buffer is positioned at the start of a traditional
|
115
|
+
# Xref table and processes it into memory.
|
116
|
+
# Reads a classic xref table, starting just after the "xref" keyword.
#
# buf - a tokenising buffer (responds to #token) positioned at the first
#       subsection header
#
# The table is a run of subsections. Each starts with two integers (first
# object id and entry count) followed by that many
# "offset generation state" entries; only entries whose state is "n" are
# stored. Reading stops at the "trailer" keyword or when the buffer runs out
# of tokens.
#
# Returns the trailer dictionary (a Hash). Raises MalformedPDFError when the
# token following the table is not a dictionary.
def load_xref_table(buf)
  params = []

  while !params.include?("trailer") && !params.include?(nil)
    if params.size == 2
      objid, count = params[0].to_i, params[1].to_i
      count.times do
        offset     = buf.token.to_i
        generation = buf.token.to_i
        state      = buf.token

        store(objid, generation, offset) if state == "n"
        objid += 1
      end
      # Reset once per subsection, outside the entry loop, so that a
      # subsection with a zero entry count is consumed as well. Otherwise
      # its header stays in params and every later subsection is missed.
      params.clear
    end
    params << buf.token
  end

  trailer = Parser.new(buf, self).parse_token

  raise MalformedPDFError, "PDF malformed, trailer should be a dictionary" unless trailer.kind_of?(Hash)

  # follow the links to any further xref sections referenced by the trailer
  load_offsets(trailer[:XRefStm])   if trailer.has_key?(:XRefStm)
  load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev)

  trailer
end
|
144
|
+
|
145
|
+
################################################################################
|
146
|
+
# Read an XRef stream from the underlying buffer instead of a traditional xref table.
|
147
|
+
#
|
148
|
+
# Reads a cross reference stream and records its entries.
#
# stream - the parsed XRef stream object; must respond to #hash (the stream
#          dictionary) and #unfiltered_data (the decoded entry bytes)
#
# Every entry is made of three integer fields whose byte widths come from
# the /W array of the stream dictionary:
#
# * type 1 - field 2 is stored as the byte offset, field 3 as the generation
# * type 2 - field 2 is the id of the stream that contains the object; a
#            PDF::Reader::Reference to it is stored under generation 0
#
# Entries of any other type, and entries whose second field is 0, are
# ignored.
#
# Returns a Hash holding the trailer style keys of the stream dictionary.
# Raises PDF::Reader::MalformedPDFError if the stream is not of /Type /XRef
# or has no usable /W array.
def load_xref_stream(stream)
  unless stream.hash[:Type] == :XRef
    raise PDF::Reader::MalformedPDFError, "xref stream not found when expected"
  end

  trailer = Hash[stream.hash.select { |key, value|
    [:Size, :Prev, :Root, :Encrypt, :Info, :ID].include?(key)
  }]

  widths = stream.hash[:W]
  unless widths.kind_of?(Array) && widths.size >= 3
    raise PDF::Reader::MalformedPDFError, "xref stream has an invalid /W entry"
  end

  entry_length = widths.inject(0) { |sum, width| sum + width }
  raw_data     = StringIO.new(stream.unfiltered_data)
  index        = stream.hash[:Index] || [0, stream.hash[:Size]]

  index.each_slice(2) do |start_id, count|
    start_id.upto(start_id + count - 1) do |objid|
      entry = raw_data.read(entry_length) || ""

      # PDF spec: a zero width for the first field means the type field is
      # absent from the data and every entry is of type 1.
      f1 = widths[0] == 0 ? 1 : unpack_bytes(entry[0, widths[0]])
      f2 = unpack_bytes(entry[widths[0], widths[1]])
      f3 = unpack_bytes(entry[widths[0] + widths[1], widths[2]])

      if f1 == 1 && f2 > 0
        store(objid, f3, f2)
      elsif f1 == 2 && f2 > 0
        store(objid, 0, PDF::Reader::Reference.new(f2, 0))
      end
    end
  end

  load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev)

  trailer
end
|
183
|
+
################################################################################
|
184
|
+
# XRef streams pack info into integers 1-N bytes wide. Depending on the number of
|
185
|
+
# bytes they need to be converted to an int in different ways.
|
186
|
+
#
|
187
|
+
# Converts a run of bytes into the unsigned integer it encodes, most
# significant byte first.
#
# bytes - a String of any length, or nil
#
# Returns an Integer; nil and the empty string both give 0. Fields of any
# width are accepted, so entries wider than four bytes are decoded too.
def unpack_bytes(bytes)
  bytes.to_s.unpack("C*").inject(0) { |value, byte| (value << 8) | byte }
end
|
202
|
+
################################################################################
|
203
|
+
# Wrap the io stream we're working with in a buffer that can tokenise it for us.
|
204
|
+
#
|
205
|
+
# We create multiple buffers so we can be tokenising multiple sections of the file
|
206
|
+
# at the same time without worrying about clearing the buffer's contents.
|
207
|
+
#
|
208
|
+
# Returns a fresh PDF::Reader::Buffer over the shared io object.
#
# offset - passed to the buffer as its :seek option (0 when omitted)
def new_buffer(offset = 0)
  PDF::Reader::Buffer.new(
    @io,
    :seek => offset
  )
end
|
211
|
+
################################################################################
|
212
|
+
# Stores an offset value for a particular PDF object ID and revision number
|
213
|
+
#
|
214
|
+
# Records +offset+ for object +id+ at generation +gen+.
#
# A value that is already present for that id and generation is kept, so
# the first call for a given pair wins. Returns the value held afterwards.
def store(id, gen, offset)
  generations = (@xref[id] ||= {})
  generations[gen] ||= offset
end
|
217
|
+
end
|
218
|
+
################################################################################
|
219
|
+
end
|
220
|
+
################################################################################
|
data/lib/pdf/reader.rb
ADDED
@@ -0,0 +1,296 @@
|
|
1
|
+
################################################################################
|
2
|
+
#
|
3
|
+
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
|
4
|
+
# Copyright (C) 2011 James Healy
|
5
|
+
#
|
6
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
7
|
+
# a copy of this software and associated documentation files (the
|
8
|
+
# "Software"), to deal in the Software without restriction, including
|
9
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
10
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
11
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
12
|
+
# the following conditions:
|
13
|
+
#
|
14
|
+
# The above copyright notice and this permission notice shall be
|
15
|
+
# included in all copies or substantial portions of the Software.
|
16
|
+
#
|
17
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
18
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
19
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
20
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
21
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
22
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
23
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
24
|
+
#
|
25
|
+
################################################################################
|
26
|
+
|
27
|
+
require 'stringio'
|
28
|
+
require 'zlib'
|
29
|
+
|
30
|
+
require 'ascii85'
|
31
|
+
|
32
|
+
module PDF
|
33
|
+
################################################################################
|
34
|
+
# The Reader class serves as an entry point for parsing a PDF file.
|
35
|
+
#
|
36
|
+
# PDF is a page based file format. There is some data associated with the
|
37
|
+
# document (metadata, bookmarks, etc) but all visible content is stored
|
38
|
+
# under a Page object.
|
39
|
+
#
|
40
|
+
# In most use cases for extracting and examining the contents of a PDF it
|
41
|
+
# makes sense to traverse the information using page based iteration.
|
42
|
+
#
|
43
|
+
# In addition to the documentation here, check out the
|
44
|
+
# PDF::Reader::Page class.
|
45
|
+
#
|
46
|
+
# == File Metadata
|
47
|
+
#
|
48
|
+
# reader = PDF::Reader.new("somefile.pdf")
|
49
|
+
#
|
50
|
+
# puts reader.pdf_version
|
51
|
+
# puts reader.info
|
52
|
+
# puts reader.metadata
|
53
|
+
# puts reader.page_count
|
54
|
+
#
|
55
|
+
# == Iterating over page content
|
56
|
+
#
|
57
|
+
# reader = PDF::Reader.new("somefile.pdf")
|
58
|
+
#
|
59
|
+
# reader.pages.each do |page|
|
60
|
+
# puts page.fonts
|
61
|
+
# puts page.images
|
62
|
+
# puts page.text
|
63
|
+
# end
|
64
|
+
#
|
65
|
+
# == Extracting all text
|
66
|
+
#
|
67
|
+
# reader = PDF::Reader.new("somefile.pdf")
|
68
|
+
#
|
69
|
+
# reader.pages.map(&:text)
|
70
|
+
#
|
71
|
+
# == Extracting content from a single page
|
72
|
+
#
|
73
|
+
# reader = PDF::Reader.new("somefile.pdf")
|
74
|
+
#
|
75
|
+
# page = reader.page(1)
|
76
|
+
# puts page.fonts
|
77
|
+
# puts page.images
|
78
|
+
# puts page.text
|
79
|
+
#
|
80
|
+
# == Low level callbacks (ala current version of PDF::Reader)
|
81
|
+
#
|
82
|
+
# reader = PDF::Reader.new("somefile.pdf")
|
83
|
+
#
|
84
|
+
# page = reader.page(1)
|
85
|
+
# page.walk(receiver)
|
86
|
+
#
|
87
|
+
class Reader
|
88
|
+
|
89
|
+
# lowlevel hash-like access to all objects in the underlying PDF
|
90
|
+
attr_reader :objects
|
91
|
+
|
92
|
+
attr_reader :page_count, :pdf_version, :info, :metadata
|
93
|
+
|
94
|
+
# creates a new document reader for the provided PDF.
|
95
|
+
#
|
96
|
+
# input can be an IO-ish object (StringIO, File, etc) containing a PDF
|
97
|
+
# or a filename
|
98
|
+
#
|
99
|
+
# reader = PDF::Reader.new("somefile.pdf")
|
100
|
+
#
|
101
|
+
# File.open("somefile.pdf","rb") do |file|
|
102
|
+
# reader = PDF::Reader.new(file)
|
103
|
+
# end
|
104
|
+
#
|
105
|
+
# Loads +input+ and caches the document level details exposed through the
# attribute readers (objects, page_count, pdf_version, info, metadata).
#
# When called without an argument nothing is loaded; the deprecated class
# level helpers rely on that form and then call #parse or #object.
def initialize(input = nil)
  return unless input # no input: deprecated Reader API

  @objects     = PDF::Reader::ObjectHash.new(input)
  @page_count  = get_page_count
  @pdf_version = @objects.pdf_version
  @info        = @objects.deref(@objects.trailer[:Info])
  @metadata    = get_metadata
end
|
114
|
+
|
115
|
+
# syntactic sugar for opening a PDF file. Accepts the same arguments
|
116
|
+
# as new().
|
117
|
+
#
|
118
|
+
# PDF::Reader.open("somefile.pdf") do |reader|
|
119
|
+
# puts reader.pdf_version
|
120
|
+
# end
|
121
|
+
#
|
122
|
+
# Creates a PDF::Reader for +input+ and yields it to the block.
#
# Returns the value of the block.
def self.open(input, &block)
  reader = PDF::Reader.new(input)
  yield reader
end
|
125
|
+
|
126
|
+
# DEPRECATED: this method was deprecated in version 0.11.0 and will
|
127
|
+
# eventually be removed
|
128
|
+
#
|
129
|
+
#
|
130
|
+
# Parse the file with the given name, sending events to the given receiver.
|
131
|
+
#
|
132
|
+
# Opens the named file in binary mode and hands it to #parse on a new,
# empty reader. The file is closed again when parsing finishes.
#
# Returns whatever #parse returns.
def self.file(name, receivers, opts = {})
  File.open(name, "rb") { |io| new.parse(io, receivers, opts) }
end
|
137
|
+
|
138
|
+
# DEPRECATED: this method was deprecated in version 0.11.0 and will
|
139
|
+
# eventually be removed
|
140
|
+
#
|
141
|
+
# Parse the given string, sending events to the given receiver.
|
142
|
+
#
|
143
|
+
# Wraps +str+ in a StringIO and hands it to #parse on a new, empty reader.
#
# Returns whatever #parse returns.
def self.string(str, receivers, opts = {})
  StringIO.open(str) { |io| new.parse(io, receivers, opts) }
end
|
148
|
+
|
149
|
+
# DEPRECATED: this method was deprecated in version 0.11.0 and will
|
150
|
+
# eventually be removed
|
151
|
+
#
|
152
|
+
# Parse the file with the given name, returning an unmarshalled ruby version of
|
153
|
+
# the requested pdf object
|
154
|
+
#
|
155
|
+
# Opens the named file in binary mode and asks a new, empty reader for one
# object. +id+ and +gen+ are converted with to_i before the lookup.
#
# Returns whatever #object returns.
def self.object_file(name, id, gen = 0)
  File.open(name, "rb") do |io|
    new.object(io, id.to_i, gen.to_i)
  end
end
|
160
|
+
|
161
|
+
# DEPRECATED: this method was deprecated in version 0.11.0 and will
|
162
|
+
# eventually be removed
|
163
|
+
#
|
164
|
+
# Parse the given string, returning an unmarshalled ruby version of
|
165
|
+
# the requested pdf object
|
166
|
+
#
|
167
|
+
# Wraps +str+ in a StringIO and asks a new, empty reader for one object.
# +id+ and +gen+ are converted with to_i before the lookup.
#
# Returns whatever #object returns.
def self.object_string(str, id, gen = 0)
  StringIO.open(str) do |io|
    new.object(io, id.to_i, gen.to_i)
  end
end
|
172
|
+
|
173
|
+
# returns an array of PDF::Reader::Page objects, one for each
|
174
|
+
# page in the source PDF.
|
175
|
+
#
|
176
|
+
# reader = PDF::Reader.new("somefile.pdf")
|
177
|
+
#
|
178
|
+
# reader.pages.each do |page|
|
179
|
+
# puts page.fonts
|
180
|
+
# puts page.images
|
181
|
+
# puts page.text
|
182
|
+
# end
|
183
|
+
#
|
184
|
+
# See the docs for PDF::Reader::Page to read more about the
|
185
|
+
# methods available on each page
|
186
|
+
#
|
187
|
+
# Builds a new PDF::Reader::Page for every page number from 1 up to the
# cached page count and returns them as an Array, in page order.
def pages
  page_numbers = 1..@page_count
  page_numbers.map { |number| PDF::Reader::Page.new(@objects, number) }
end
|
192
|
+
|
193
|
+
# returns a single PDF::Reader::Page for the specified page.
|
194
|
+
# Use this instead of pages method when you need to access just a single
|
195
|
+
# page
|
196
|
+
#
|
197
|
+
# reader = PDF::Reader.new("somefile.pdf")
|
198
|
+
# page = reader.page(10)
|
199
|
+
#
|
200
|
+
# puts page.text
|
201
|
+
#
|
202
|
+
# See the docs for PDF::Reader::Page to read more about the
|
203
|
+
# methods available on each page
|
204
|
+
#
|
205
|
+
# Returns a new PDF::Reader::Page for page +num+.
#
# num - the page number, counted from 1; converted with to_i first
#
# Raises ArgumentError when the number is below 1 or above the page count.
def page(num)
  num = num.to_i
  unless num >= 1 && num <= @page_count
    raise ArgumentError, "valid pages are 1 .. #{@page_count}"
  end

  PDF::Reader::Page.new(@objects, num)
end
|
210
|
+
|
211
|
+
|
212
|
+
# DEPRECATED: this method was deprecated in version 0.11.0 and will
|
213
|
+
# eventually be removed
|
214
|
+
#
|
215
|
+
# Given an IO object that contains PDF data, parse it.
|
216
|
+
#
|
217
|
+
# Runs every strategy over the PDF data in +io+, passing +receivers+ and the
# merged options to each one.
#
# io        - an IO object containing PDF data
# receivers - handed unchanged to each strategy
# opts      - overrides for the defaults
#             :pages => true, :raw_text => false, :metadata => true
#
# Returns self. Raises PDF::Reader::UnsupportedFeatureError when the trailer
# has an :Encrypt entry.
def parse(io, receivers, opts = {})
  ohash     = ObjectHash.new(io)
  encrypted = ohash.trailer[:Encrypt]

  raise ::PDF::Reader::UnsupportedFeatureError, 'PDF::Reader cannot read encrypted PDF files' if encrypted

  options = {:pages => true, :raw_text => false, :metadata => true}.merge(opts)

  strategies.each { |strategy| strategy.new(ohash, receivers, options).process }

  self
end
|
233
|
+
|
234
|
+
# DEPRECATED: this method was deprecated in version 0.11.0 and will
|
235
|
+
# eventually be removed
|
236
|
+
#
|
237
|
+
# Given an IO object that contains PDF data, return the contents of a single object
|
238
|
+
#
|
239
|
+
# Replaces this reader's object hash with one built from +io+, then
# dereferences the object identified by +id+ and +gen+.
#
# Returns the result of the dereference.
def object(io, id, gen)
  @objects  = ObjectHash.new(io)
  reference = Reference.new(id, gen)

  @objects.deref(reference)
end
|
244
|
+
|
245
|
+
private
|
246
|
+
|
247
|
+
# The strategy classes run by #parse, in the order they are run. The list is
# built on first use and the same Array is returned afterwards.
def strategies
  @strategies ||= [::PDF::Reader::MetadataStrategy, ::PDF::Reader::PagesStrategy]
end
|
253
|
+
|
254
|
+
# Returns the object referenced by the trailer's :Root entry.
#
# The result is memoized in @root, so the trailer entry is dereferenced at
# most once per reader rather than on every call.
def root
  @root ||= @objects.deref(@objects.trailer[:Root])
end
|
257
|
+
|
258
|
+
# Dereferences the :Metadata entry of the root object.
#
# Returns the stream's unfiltered data, or nil when the dereference yields
# nothing.
def get_metadata
  metadata_stream = @objects.deref(root[:Metadata])
  return nil unless metadata_stream

  metadata_stream.unfiltered_data
end
|
262
|
+
|
263
|
+
# Dereferences the :Pages entry of the root object and returns the value
# stored under its :Count key.
def get_page_count
  page_tree = @objects.deref(root[:Pages])
  page_tree[:Count]
end
|
267
|
+
|
268
|
+
end
|
269
|
+
end
|
270
|
+
################################################################################
|
271
|
+
|
272
|
+
require 'pdf/reader/abstract_strategy'
|
273
|
+
require 'pdf/reader/buffer'
|
274
|
+
require 'pdf/reader/cmap'
|
275
|
+
require 'pdf/reader/encoding'
|
276
|
+
require 'pdf/reader/error'
|
277
|
+
require 'pdf/reader/filter'
|
278
|
+
require 'pdf/reader/font'
|
279
|
+
require 'pdf/reader/form_xobject'
|
280
|
+
require 'pdf/reader/lzw'
|
281
|
+
require 'pdf/reader/metadata_strategy'
|
282
|
+
require 'pdf/reader/object_cache'
|
283
|
+
require 'pdf/reader/object_hash'
|
284
|
+
require 'pdf/reader/object_stream'
|
285
|
+
require 'pdf/reader/pages_strategy'
|
286
|
+
require 'pdf/reader/parser'
|
287
|
+
require 'pdf/reader/print_receiver'
|
288
|
+
require 'pdf/reader/reference'
|
289
|
+
require 'pdf/reader/register_receiver'
|
290
|
+
require 'pdf/reader/stream'
|
291
|
+
require 'pdf/reader/text_receiver'
|
292
|
+
require 'pdf/reader/page_text_receiver'
|
293
|
+
require 'pdf/reader/token'
|
294
|
+
require 'pdf/reader/xref'
|
295
|
+
require 'pdf/reader/page'
|
296
|
+
require 'pdf/hash'
|
data/lib/pdf-reader.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "pdf/reader"
|