fireinc-pdf-reader 0.11.0.alpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +168 -0
- data/MIT-LICENSE +21 -0
- data/README.rdoc +137 -0
- data/Rakefile +34 -0
- data/TODO +45 -0
- data/bin/pdf_list_callbacks +15 -0
- data/bin/pdf_object +48 -0
- data/bin/pdf_text +15 -0
- data/examples/callbacks.rb +21 -0
- data/examples/extract_bates.rb +49 -0
- data/examples/extract_images.rb +108 -0
- data/examples/hash.rb +12 -0
- data/examples/metadata.rb +25 -0
- data/examples/page_counter_improved.rb +23 -0
- data/examples/page_counter_naive.rb +24 -0
- data/examples/rspec.rb +57 -0
- data/examples/text.rb +40 -0
- data/examples/version.rb +25 -0
- data/lib/pdf/hash.rb +15 -0
- data/lib/pdf/reader/abstract_strategy.rb +81 -0
- data/lib/pdf/reader/buffer.rb +346 -0
- data/lib/pdf/reader/cmap.rb +138 -0
- data/lib/pdf/reader/encoding.rb +190 -0
- data/lib/pdf/reader/encodings/mac_expert.txt +159 -0
- data/lib/pdf/reader/encodings/mac_roman.txt +128 -0
- data/lib/pdf/reader/encodings/pdf_doc.txt +40 -0
- data/lib/pdf/reader/encodings/standard.txt +47 -0
- data/lib/pdf/reader/encodings/symbol.txt +154 -0
- data/lib/pdf/reader/encodings/win_ansi.txt +29 -0
- data/lib/pdf/reader/encodings/zapf_dingbats.txt +201 -0
- data/lib/pdf/reader/error.rb +53 -0
- data/lib/pdf/reader/filter.rb +219 -0
- data/lib/pdf/reader/font.rb +133 -0
- data/lib/pdf/reader/form_xobject.rb +83 -0
- data/lib/pdf/reader/glyphlist.txt +4322 -0
- data/lib/pdf/reader/lzw.rb +123 -0
- data/lib/pdf/reader/metadata_strategy.rb +56 -0
- data/lib/pdf/reader/object_cache.rb +85 -0
- data/lib/pdf/reader/object_hash.rb +289 -0
- data/lib/pdf/reader/object_stream.rb +51 -0
- data/lib/pdf/reader/page.rb +185 -0
- data/lib/pdf/reader/page_text_receiver.rb +278 -0
- data/lib/pdf/reader/pages_strategy.rb +475 -0
- data/lib/pdf/reader/parser.rb +225 -0
- data/lib/pdf/reader/print_receiver.rb +18 -0
- data/lib/pdf/reader/reference.rb +66 -0
- data/lib/pdf/reader/register_receiver.rb +95 -0
- data/lib/pdf/reader/stream.rb +69 -0
- data/lib/pdf/reader/text_receiver.rb +264 -0
- data/lib/pdf/reader/token.rb +41 -0
- data/lib/pdf/reader/xref.rb +220 -0
- data/lib/pdf/reader.rb +296 -0
- data/lib/pdf-reader.rb +1 -0
- metadata +211 -0
@@ -0,0 +1,220 @@
|
|
1
|
+
################################################################################
|
2
|
+
#
|
3
|
+
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
|
4
|
+
#
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
# a copy of this software and associated documentation files (the
|
7
|
+
# "Software"), to deal in the Software without restriction, including
|
8
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
# the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be
|
14
|
+
# included in all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
23
|
+
#
|
24
|
+
################################################################################
|
25
|
+
|
26
|
+
class PDF::Reader
|
27
|
+
################################################################################
|
28
|
+
# An internal PDF::Reader class that represents the XRef table in a PDF file as a
|
29
|
+
# hash-like object.
|
30
|
+
#
|
31
|
+
# An Xref table is a map of object identifiers and byte offsets. Any time a particular
|
32
|
+
# object needs to be found, the Xref table is used to find where it is stored in the
|
33
|
+
# file.
|
34
|
+
#
|
35
|
+
# Hash keys are object ids, values are either:
|
36
|
+
#
|
37
|
+
# * a byte offset where the object starts (regular PDF objects)
|
38
|
+
# * a PDF::Reader::Reference instance that points to a stream that contains the
|
39
|
+
# desired object (PDF objects embedded in an object stream)
|
40
|
+
#
|
41
|
+
# The class behaves much like a standard Ruby hash, including the use of
|
42
|
+
# the Enumerable mixin. The key difference is no []= method - the hash
|
43
|
+
# is read only.
|
44
|
+
#
|
45
|
+
class XRef
|
46
|
+
include Enumerable
|
47
|
+
attr_reader :trailer
|
48
|
+
|
49
|
+
################################################################################
|
50
|
+
# create a new Xref table based on the contents of the supplied io object
|
51
|
+
#
|
52
|
+
# io - must be an IO object, generally either a file or a StringIO
|
53
|
+
#
|
54
|
+
def initialize(io)
  # Keep the io handle and start with an empty table; entries are kept as
  # object id => { generation => location }.
  @io, @xref = io, {}

  # load_offsets fills the table as a side effect and returns the trailer.
  @trailer = load_offsets
end
|
59
|
+
################################################################################
|
60
|
+
# return the number of objects in this file. Objects with multiple generations are
|
61
|
+
# only counted once.
|
62
|
+
def size
  # The table holds one top-level entry per object id, however many
  # generations are recorded under it.
  @xref.length
end
|
65
|
+
################################################################################
|
66
|
+
# returns the byte offset for the specified PDF object.
|
67
|
+
#
|
68
|
+
# ref - a PDF::Reader::Reference object containing an object ID and revision number
|
69
|
+
def [](ref)
  begin
    generations = @xref[ref.id]
    generations[ref.gen]
  rescue
    # Any failure during the lookup (for example an unknown object id) is
    # reported to the caller as an invalid object.
    raise InvalidObjectError, "Object #{ref.id}, Generation #{ref.gen} is invalid"
  end
end
|
74
|
+
################################################################################
|
75
|
+
# iterate over each object in the xref table
|
76
|
+
def each(&block)
  @xref.keys.sort.each do |objid|
    # Only the highest generation recorded for an object id is yielded.
    newest_gen = @xref[objid].keys.sort.last
    yield PDF::Reader::Reference.new(objid, newest_gen)
  end
end
|
83
|
+
################################################################################
|
84
|
+
private
|
85
|
+
################################################################################
|
86
|
+
# Read a xref table from the underlying buffer.
|
87
|
+
#
|
88
|
+
# If offset is specified the table will be loaded from there, otherwise the
|
89
|
+
# default offset will be located and used.
|
90
|
+
#
|
91
|
+
# After seeking to the offset, processing is handed off to either load_xref_table()
|
92
|
+
# or load_xref_stream() based on what we find there.
|
93
|
+
#
|
94
|
+
def load_offsets(offset = nil)
  offset ||= new_buffer.find_first_xref_offset

  buffer = new_buffer(offset)
  first = buffer.token

  # A traditional table is introduced by the xref keyword ("ref" is accepted too).
  if first == "xref" || first == "ref"
    return load_xref_table(buffer)
  end

  second = buffer.token
  third = buffer.token

  # Otherwise the only accepted form is an "<id> <gen> obj" header, which is
  # then parsed as an xref stream.
  looks_like_object = first.to_i >= 0 && second.to_i >= 0 && third == "obj"
  unless looks_like_object
    raise PDF::Reader::MalformedPDFError, "xref table not found at offset #{offset} (#{first} != xref)"
  end

  # Start again from the same offset so the parser reads the whole object.
  parser = PDF::Reader::Parser.new(new_buffer(offset))
  load_xref_stream(parser.object(first.to_i, second.to_i))
end
|
113
|
+
################################################################################
|
114
|
+
# Assumes the underlying buffer is positioned at the start of a traditional
|
115
|
+
# Xref table and processes it into memory.
|
116
|
+
def load_xref_table(buf)
  # buf - a tokenising buffer positioned just after the xref keyword.
  # Returns the trailer dictionary (a Hash) that follows the table.
  # Raises MalformedPDFError when the trailer is not a dictionary.
  params = []

  # Tokens are gathered two at a time: every subsection starts with a
  # "<first object id> <entry count>" header. Reading stops at the trailer
  # keyword or when the buffer has no more tokens (nil).
  while !params.include?("trailer") && !params.include?(nil)
    if params.size == 2
      objid, count = params[0].to_i, params[1].to_i
      count.times do
        # Each entry is "<byte offset> <generation> <state>".
        offset = buf.token.to_i
        generation = buf.token.to_i
        state = buf.token

        # Only entries in state "n" are recorded; any other state is skipped.
        store(objid, generation, offset) if state == "n"
        objid += 1
      end
      # Reset after every header, including one that declares zero entries,
      # so that the following subsection header is still recognised.
      params.clear
    end
    params << buf.token
  end

  trailer = Parser.new(buf, self).parse_token

  raise MalformedPDFError, "PDF malformed, trailer should be a dictionary" unless trailer.kind_of?(Hash)

  # Follow the links to further xref data named by the trailer.
  load_offsets(trailer[:XRefStm]) if trailer.has_key?(:XRefStm)
  load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev)

  trailer
end
|
144
|
+
|
145
|
+
################################################################################
|
146
|
+
# Read an XRef stream from the underlying buffer instead of a traditional xref table.
|
147
|
+
#
|
148
|
+
def load_xref_stream(stream)
  # stream - the parsed stream object found at the xref offset.
  # Returns a trailer Hash built from the stream dictionary.
  # Raises PDF::Reader::MalformedPDFError when the stream is not of type XRef.
  unless stream.hash[:Type] == :XRef
    raise PDF::Reader::MalformedPDFError, "xref stream not found when expected"
  end

  # Only the trailer-related keys of the stream dictionary are kept.
  trailer = Hash[stream.hash.select { |key, value|
    [:Size, :Prev, :Root, :Encrypt, :Info, :ID].include?(key)
  }]

  # :W holds the byte width of each of the three fields in an entry.
  widths = stream.hash[:W]
  entry_length = widths.inject(0) { |s, w| s + w }
  raw_data = StringIO.new(stream.unfiltered_data)

  # :Index lists (first object id, count) pairs; when it is absent the
  # entries describe objects 0 up to Size - 1.
  index = stream.hash[:Index] || [0, stream.hash[:Size]]

  index.each_slice(2) do |start_id, size|
    (start_id...(start_id + size)).each do |objid|
      # A short read (data exhausted) is treated as an all-zero entry.
      entry = raw_data.read(entry_length) || ""

      # A zero-width type field is not present in the data at all; the PDF
      # specification gives it a default of type 1 in that case.
      f1 = widths[0] == 0 ? 1 : unpack_bytes(entry[0, widths[0]])
      f2 = unpack_bytes(entry[widths[0], widths[1]])
      f3 = unpack_bytes(entry[widths[0] + widths[1], widths[2]])

      if f1 == 1 && f2 > 0
        # Type 1: stored as store(id, gen, offset) with f2 as offset, f3 as generation.
        store(objid, f3, f2)
      elsif f1 == 2 && f2 > 0
        # Type 2: recorded as a reference to object f2 (generation 0).
        store(objid, 0, PDF::Reader::Reference.new(f2, 0))
      end
    end
  end

  load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev)

  trailer
end
|
183
|
+
################################################################################
|
184
|
+
# XRef streams pack info into integers 1-N bytes wide. Depending on the number of
|
185
|
+
# bytes they need to be converted to an int in different ways.
|
186
|
+
#
|
187
|
+
def unpack_bytes(bytes)
  # bytes - a String holding one packed field, or nil.
  # Returns the field as a non-negative Integer; nil and "" give 0.
  #
  # The field is read as an unsigned big-endian integer by folding in one
  # byte at a time, so any width is supported (not only 1-4 bytes) and the
  # result depends on the raw bytes rather than on the string's encoding.
  bytes.to_s.unpack("C*").inject(0) { |value, byte| (value << 8) | byte }
end
|
202
|
+
################################################################################
|
203
|
+
# Wrap the io stream we're working with in a buffer that can tokenise it for us.
|
204
|
+
#
|
205
|
+
# We create multiple buffers so we can be tokenising multiple sections of the file
|
206
|
+
# at the same time without worrying about clearing the buffer's contents.
|
207
|
+
#
|
208
|
+
def new_buffer(offset = 0)
  # Every call builds a separate buffer over the same io, passing the
  # requested offset as the :seek option.
  buffer_class = PDF::Reader::Buffer
  buffer_class.new(@io, :seek => offset)
end
|
211
|
+
################################################################################
|
212
|
+
# Stores an offset value for a particular PDF object ID and revision number
|
213
|
+
#
|
214
|
+
def store(id, gen, offset)
  # Look up (or create) the per-object map of generation => location.
  generations = (@xref[id] ||= {})

  # An entry that is already present wins, so repeated calls for the same
  # id and generation keep the first value.
  generations[gen] ||= offset
end
|
217
|
+
end
|
218
|
+
################################################################################
|
219
|
+
end
|
220
|
+
################################################################################
|
data/lib/pdf/reader.rb
ADDED
@@ -0,0 +1,296 @@
|
|
1
|
+
################################################################################
|
2
|
+
#
|
3
|
+
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
|
4
|
+
# Copyright (C) 2011 James Healy
|
5
|
+
#
|
6
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
7
|
+
# a copy of this software and associated documentation files (the
|
8
|
+
# "Software"), to deal in the Software without restriction, including
|
9
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
10
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
11
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
12
|
+
# the following conditions:
|
13
|
+
#
|
14
|
+
# The above copyright notice and this permission notice shall be
|
15
|
+
# included in all copies or substantial portions of the Software.
|
16
|
+
#
|
17
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
18
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
19
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
20
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
21
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
22
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
23
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
24
|
+
#
|
25
|
+
################################################################################
|
26
|
+
|
27
|
+
require 'stringio'
|
28
|
+
require 'zlib'
|
29
|
+
|
30
|
+
require 'ascii85'
|
31
|
+
|
32
|
+
module PDF
|
33
|
+
################################################################################
|
34
|
+
# The Reader class serves as an entry point for parsing a PDF file.
|
35
|
+
#
|
36
|
+
# PDF is a page based file format. There is some data associated with the
|
37
|
+
# document (metadata, bookmarks, etc) but all visible content is stored
|
38
|
+
# under a Page object.
|
39
|
+
#
|
40
|
+
# In most use cases for extracting and examining the contents of a PDF it
|
41
|
+
# makes sense to traverse the information using page based iteration.
|
42
|
+
#
|
43
|
+
# In addition to the documentation here, check out the
|
44
|
+
# PDF::Reader::Page class.
|
45
|
+
#
|
46
|
+
# == File Metadata
|
47
|
+
#
|
48
|
+
# reader = PDF::Reader.new("somefile.pdf")
|
49
|
+
#
|
50
|
+
# puts reader.pdf_version
|
51
|
+
# puts reader.info
|
52
|
+
# puts reader.metadata
|
53
|
+
# puts reader.page_count
|
54
|
+
#
|
55
|
+
# == Iterating over page content
|
56
|
+
#
|
57
|
+
# reader = PDF::Reader.new("somefile.pdf")
|
58
|
+
#
|
59
|
+
# reader.pages.each do |page|
|
60
|
+
# puts page.fonts
|
61
|
+
# puts page.images
|
62
|
+
# puts page.text
|
63
|
+
# end
|
64
|
+
#
|
65
|
+
# == Extracting all text
|
66
|
+
#
|
67
|
+
# reader = PDF::Reader.new("somefile.pdf")
|
68
|
+
#
|
69
|
+
# reader.pages.map(&:text)
|
70
|
+
#
|
71
|
+
# == Extracting content from a single page
|
72
|
+
#
|
73
|
+
# reader = PDF::Reader.new("somefile.pdf")
|
74
|
+
#
|
75
|
+
# page = reader.page(1)
|
76
|
+
# puts page.fonts
|
77
|
+
# puts page.images
|
78
|
+
# puts page.text
|
79
|
+
#
|
80
|
+
# == Low level callbacks (ala current version of PDF::Reader)
|
81
|
+
#
|
82
|
+
# reader = PDF::Reader.new("somefile.pdf")
|
83
|
+
#
|
84
|
+
# page = reader.page(1)
|
85
|
+
# page.walk(receiver)
|
86
|
+
#
|
87
|
+
class Reader
|
88
|
+
|
89
|
+
# lowlevel hash-like access to all objects in the underlying PDF
|
90
|
+
attr_reader :objects
|
91
|
+
|
92
|
+
attr_reader :page_count, :pdf_version, :info, :metadata
|
93
|
+
|
94
|
+
# creates a new document reader for the provided PDF.
|
95
|
+
#
|
96
|
+
# input can be an IO-ish object (StringIO, File, etc) containing a PDF
|
97
|
+
# or a filename
|
98
|
+
#
|
99
|
+
# reader = PDF::Reader.new("somefile.pdf")
|
100
|
+
#
|
101
|
+
# File.open("somefile.pdf","rb") do |file|
|
102
|
+
# reader = PDF::Reader.new(file)
|
103
|
+
# end
|
104
|
+
#
|
105
|
+
def initialize(input = nil)
  # With no input nothing is loaded, which keeps the deprecated Reader API
  # working (its methods are handed the IO later).
  return unless input

  @objects = PDF::Reader::ObjectHash.new(input)
  @page_count = get_page_count
  @pdf_version = @objects.pdf_version
  @info = @objects.deref(@objects.trailer[:Info])
  @metadata = get_metadata
end
|
114
|
+
|
115
|
+
# syntactic sugar for opening a PDF file. Accepts the same arguments
|
116
|
+
# as new().
|
117
|
+
#
|
118
|
+
# PDF::Reader.open("somefile.pdf") do |reader|
|
119
|
+
# puts reader.pdf_version
|
120
|
+
# end
|
121
|
+
#
|
122
|
+
def self.open(input, &block)
  # Build the reader first, then hand it to the caller's block; the block's
  # value is the return value.
  reader = PDF::Reader.new(input)
  yield reader
end
|
125
|
+
|
126
|
+
# DEPRECATED: this method was deprecated in version 0.11.0 and will
|
127
|
+
# eventually be removed
|
128
|
+
#
|
129
|
+
#
|
130
|
+
# Parse the file with the given name, sending events to the given receiver.
|
131
|
+
#
|
132
|
+
def self.file(name, receivers, opts = {})
  # The file is opened in binary mode for the duration of the parse and the
  # value of parse is returned.
  File.open(name, "rb") { |handle| new.parse(handle, receivers, opts) }
end
|
137
|
+
|
138
|
+
# DEPRECATED: this method was deprecated in version 0.11.0 and will
|
139
|
+
# eventually be removed
|
140
|
+
#
|
141
|
+
# Parse the given string, sending events to the given receiver.
|
142
|
+
#
|
143
|
+
def self.string(str, receivers, opts = {})
  # Wrap the string in a StringIO so it can be parsed like any other IO.
  StringIO.open(str) { |stream| new.parse(stream, receivers, opts) }
end
|
148
|
+
|
149
|
+
# DEPRECATED: this method was deprecated in version 0.11.0 and will
|
150
|
+
# eventually be removed
|
151
|
+
#
|
152
|
+
# Parse the file with the given name, returning an unmarshalled ruby object that
|
153
|
+
# represents the requested pdf object
|
154
|
+
#
|
155
|
+
def self.object_file(name, id, gen = 0)
  File.open(name, "rb") do |handle|
    reader = new
    # id and gen are coerced to integers before the lookup.
    reader.object(handle, id.to_i, gen.to_i)
  end
end
|
160
|
+
|
161
|
+
# DEPRECATED: this method was deprecated in version 0.11.0 and will
|
162
|
+
# eventually be removed
|
163
|
+
#
|
164
|
+
# Parse the given string, returning an unmarshalled ruby object that represents
|
165
|
+
# the requested pdf object
|
166
|
+
#
|
167
|
+
def self.object_string(str, id, gen = 0)
  StringIO.open(str) do |stream|
    reader = new
    # id and gen are coerced to integers before the lookup.
    reader.object(stream, id.to_i, gen.to_i)
  end
end
|
172
|
+
|
173
|
+
# returns an array of PDF::Reader::Page objects, one for each
|
174
|
+
# page in the source PDF.
|
175
|
+
#
|
176
|
+
# reader = PDF::Reader.new("somefile.pdf")
|
177
|
+
#
|
178
|
+
# reader.pages.each do |page|
|
179
|
+
# puts page.fonts
|
180
|
+
# puts page.images
|
181
|
+
# puts page.text
|
182
|
+
# end
|
183
|
+
#
|
184
|
+
# See the docs for PDF::Reader::Page to read more about the
|
185
|
+
# methods available on each page
|
186
|
+
#
|
187
|
+
def pages
  # Page numbers are 1-based and run up to the stored page count.
  page_numbers = 1..@page_count
  page_numbers.collect do |number|
    PDF::Reader::Page.new(@objects, number)
  end
end
|
192
|
+
|
193
|
+
# returns a single PDF::Reader::Page for the specified page.
|
194
|
+
# Use this instead of pages method when you need to access just a single
|
195
|
+
# page
|
196
|
+
#
|
197
|
+
# reader = PDF::Reader.new("somefile.pdf")
|
198
|
+
# page = reader.page(10)
|
199
|
+
#
|
200
|
+
# puts page.text
|
201
|
+
#
|
202
|
+
# See the docs for PDF::Reader::Page to read more about the
|
203
|
+
# methods available on each page
|
204
|
+
#
|
205
|
+
def page(num)
  page_number = num.to_i

  # Reject anything outside 1 .. page count before building the page.
  if page_number < 1 || page_number > @page_count
    raise ArgumentError, "valid pages are 1 .. #{@page_count}"
  end

  PDF::Reader::Page.new(@objects, page_number)
end
|
210
|
+
|
211
|
+
|
212
|
+
# DEPRECATED: this method was deprecated in version 0.11.0 and will
|
213
|
+
# eventually be removed
|
214
|
+
#
|
215
|
+
# Given an IO object that contains PDF data, parse it.
|
216
|
+
#
|
217
|
+
def parse(io, receivers, opts = {})
  ohash = ObjectHash.new(io)

  # Files whose trailer carries an :Encrypt entry are rejected up front.
  raise ::PDF::Reader::UnsupportedFeatureError, 'PDF::Reader cannot read encrypted PDF files' if ohash.trailer[:Encrypt]

  # Caller-supplied options override these defaults.
  defaults = {:pages => true, :raw_text => false, :metadata => true}
  options = defaults.merge(opts)

  # Each strategy is built with the same arguments and run in turn.
  strategies.each { |strategy| strategy.new(ohash, receivers, options).process }

  self
end
|
233
|
+
|
234
|
+
# DEPRECATED: this method was deprecated in version 0.11.0 and will
|
235
|
+
# eventually be removed
|
236
|
+
#
|
237
|
+
# Given an IO object that contains PDF data, return the contents of a single object
|
238
|
+
#
|
239
|
+
def object(io, id, gen)
  # The object table is rebuilt from the supplied io on every call.
  @objects = ObjectHash.new(io)

  reference = Reference.new(id, gen)
  @objects.deref(reference)
end
|
244
|
+
|
245
|
+
private
|
246
|
+
|
247
|
+
def strategies
  # Built once and reused on later calls.
  return @strategies if @strategies

  @strategies = [::PDF::Reader::MetadataStrategy, ::PDF::Reader::PagesStrategy]
end
|
253
|
+
|
254
|
+
def root
  # Returns the object that the trailer's :Root entry resolves to.
  # The result is cached in an instance variable so later calls reuse it;
  # assigning to a local variable here would discard it on every call.
  @root ||= @objects.deref(@objects.trailer[:Root])
end
|
257
|
+
|
258
|
+
def get_metadata
  metadata_stream = @objects.deref(root[:Metadata])

  # No metadata stream resolved: report nil rather than failing.
  return nil unless metadata_stream

  metadata_stream.unfiltered_data
end
|
262
|
+
|
263
|
+
def get_page_count
  # Returns the :Count entry of the object that the root's :Pages entry
  # resolves to.
  pages = @objects.deref(root[:Pages])

  # :Count may itself be stored as an indirect reference, so resolve it too.
  # NOTE(review): relies on deref returning non-reference values unchanged,
  # as its use on possibly-missing entries elsewhere in this class suggests.
  @objects.deref(pages[:Count])
end
|
267
|
+
|
268
|
+
end
|
269
|
+
end
|
270
|
+
################################################################################
|
271
|
+
|
272
|
+
require 'pdf/reader/abstract_strategy'
|
273
|
+
require 'pdf/reader/buffer'
|
274
|
+
require 'pdf/reader/cmap'
|
275
|
+
require 'pdf/reader/encoding'
|
276
|
+
require 'pdf/reader/error'
|
277
|
+
require 'pdf/reader/filter'
|
278
|
+
require 'pdf/reader/font'
|
279
|
+
require 'pdf/reader/form_xobject'
|
280
|
+
require 'pdf/reader/lzw'
|
281
|
+
require 'pdf/reader/metadata_strategy'
|
282
|
+
require 'pdf/reader/object_cache'
|
283
|
+
require 'pdf/reader/object_hash'
|
284
|
+
require 'pdf/reader/object_stream'
|
285
|
+
require 'pdf/reader/pages_strategy'
|
286
|
+
require 'pdf/reader/parser'
|
287
|
+
require 'pdf/reader/print_receiver'
|
288
|
+
require 'pdf/reader/reference'
|
289
|
+
require 'pdf/reader/register_receiver'
|
290
|
+
require 'pdf/reader/stream'
|
291
|
+
require 'pdf/reader/text_receiver'
|
292
|
+
require 'pdf/reader/page_text_receiver'
|
293
|
+
require 'pdf/reader/token'
|
294
|
+
require 'pdf/reader/xref'
|
295
|
+
require 'pdf/reader/page'
|
296
|
+
require 'pdf/hash'
|
data/lib/pdf-reader.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "pdf/reader"
|