pdf-reader 0.8.6 → 0.9.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +17 -0
- data/README.rdoc +7 -15
- data/Rakefile +10 -63
- data/TODO +6 -8
- data/bin/pdf_object +3 -0
- data/bin/pdf_text +4 -2
- data/examples/extract_images.rb +108 -0
- data/examples/hash.rb +1 -1
- data/examples/text.rb +3 -0
- data/lib/pdf/hash.rb +8 -225
- data/lib/pdf/reader.rb +79 -55
- data/lib/pdf/reader/abstract_strategy.rb +77 -0
- data/lib/pdf/reader/buffer.rb +61 -40
- data/lib/pdf/reader/cmap.rb +11 -10
- data/lib/pdf/reader/encoding.rb +85 -79
- data/lib/pdf/reader/error.rb +1 -2
- data/lib/pdf/reader/filter.rb +109 -6
- data/lib/pdf/reader/font.rb +11 -11
- data/lib/pdf/reader/lzw.rb +123 -0
- data/lib/pdf/reader/metadata_strategy.rb +53 -0
- data/lib/pdf/reader/object_hash.rb +275 -0
- data/lib/pdf/reader/object_stream.rb +51 -0
- data/lib/pdf/reader/{content.rb → pages_strategy.rb} +63 -100
- data/lib/pdf/reader/parser.rb +74 -37
- data/lib/pdf/reader/print_receiver.rb +0 -1
- data/lib/pdf/reader/register_receiver.rb +21 -0
- data/lib/pdf/reader/stream.rb +5 -1
- data/lib/pdf/reader/text_receiver.rb +3 -1
- data/lib/pdf/reader/token.rb +1 -1
- data/lib/pdf/reader/xref.rb +126 -64
- metadata +61 -13
- data/lib/pdf/reader/explore.rb +0 -116
@@ -0,0 +1,53 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
class PDF::Reader
|
4
|
+
|
5
|
+
# Strategy that reports document level information to the receiver: the PDF
# version, the info dictionary, any XML metadata stream and the page count.
#
# The helpers options, ohash, root, info, pages, info?, pages?, callback and
# decode_strings are not defined here; presumably they are inherited from
# AbstractStrategy.
#
class MetadataStrategy < AbstractStrategy # :nodoc:

  # the symbol identifying this strategy
  def self.to_sym
    :metadata
  end

  # Fire the metadata callbacks. Returns false without doing anything unless
  # the :metadata option is set.
  #
  def process
    return false unless options[:metadata]

    # the file format version, for anyone who cares
    callback(:pdf_version, ohash.pdf_version)

    # classic info dictionary
    if info?
      callback(:metadata, [decoded_info])
    end

    # XML metadata stream
    if xml_metadata?
      callback(:xml_metadata, [xml_metadata])
    end

    # number of pages
    callback(:page_count, ohash.object(pages[:Count]).to_i) if pages?
  end

  private

  # The document's XML metadata as a string, or nil when the root object has
  # no :Metadata entry. The result (including nil) is remembered, so the
  # stream is decoded at most once.
  #
  def xml_metadata
    unless defined?(@xml_metadata)
      metadata_ref = root[:Metadata]
      if metadata_ref.nil?
        @xml_metadata = nil
      else
        raw = ohash.object(metadata_ref).unfiltered_data
        # only strings on ruby 1.9+ carry an encoding
        raw.force_encoding("utf-8") if raw.respond_to?(:force_encoding)
        @xml_metadata = raw
      end
    end
    @xml_metadata
  end

  # true if the document has XML metadata
  def xml_metadata?
    !!xml_metadata
  end

  # the info dictionary after passing through decode_strings, memoized
  def decoded_info
    @decoded_info ||= decode_strings(info)
  end

end
|
53
|
+
end
|
@@ -0,0 +1,275 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
class PDF::Reader
|
4
|
+
# Provides low level access to the objects in a PDF file via a hash-like
# object.
#
# A PDF file can be viewed as a large hash map. It is a series of objects
# stored at exact byte offsets, and a table that maps object IDs to byte
# offsets. Given an object ID, looking up an object is an O(1) operation.
#
# Each PDF object can be mapped to a ruby object, so by passing an object
# ID to the [] method, a ruby representation of that object will be
# retrieved.
#
# The class behaves much like a standard Ruby hash, including the use of
# the Enumerable mixin. The key difference is no []= method - the hash
# is read only.
#
# == Basic Usage
#
#     h = PDF::Reader::ObjectHash.new("somefile.pdf")
#     h[1]
#     => 3469
#
#     h[PDF::Reader::Reference.new(1,0)]
#     => 3469
#
class ObjectHash
  include Enumerable

  attr_accessor :default
  attr_reader :trailer, :pdf_version

  # Creates a new ObjectHash object. input can be an IO-like object (anything
  # that responds to seek and read) or a string naming an existing file.
  #
  # A named file is read completely into memory and wrapped in a StringIO.
  # Any other input raises ArgumentError.
  #
  def initialize(input)
    if input.respond_to?(:seek) && input.respond_to?(:read)
      @io = input
    elsif File.file?(input.to_s)
      # prefer binread where it exists so the bytes are not transcoded
      if File.respond_to?(:binread)
        input = File.binread(input.to_s)
      else
        input = File.read(input.to_s)
      end
      @io = StringIO.new(input)
    else
      raise ArgumentError, "input must be an IO-like object or a filename"
    end
    @pdf_version = read_version
    @xref = PDF::Reader::XRef.new(@io)
    @trailer = @xref.trailer
  end

  # returns the type of object a ref points to, as a symbol of the ruby class
  # name. Returns nil if the lookup raises.
  def obj_type(ref)
    self[ref].class.to_s.to_sym
  rescue
    nil
  end

  # returns true if the supplied reference points to an object with a stream.
  # Returns false if the lookup raises.
  def stream?(ref)
    self[ref].class == PDF::Reader::Stream
  rescue
    false
  end

  # Access an object from the PDF. key can be an int or a PDF::Reader::Reference
  # object.
  #
  # If an int is used, the object with that ID and a generation number of 0 will
  # be returned.
  #
  # If a PDF::Reader::Reference object is used the exact ID and generation number
  # can be specified.
  #
  # Returns the value of default when the key is not a positive ID or when an
  # InvalidObjectError is raised during the lookup. Returns nil when the xref
  # table has no usable entry for the key.
  #
  def [](key)
    return default if key.to_i <= 0
    begin
      unless key.kind_of?(PDF::Reader::Reference)
        key = PDF::Reader::Reference.new(key.to_i, 0)
      end
      location = xref[key]
      if location.is_a?(Integer)
        # Integer rather than Fixnum: Fixnum no longer exists on current
        # rubies and never matched very large (Bignum) offsets.
        buf = new_buffer(location)
        Parser.new(buf, self).object(key.id, key.gen)
      elsif location.is_a?(PDF::Reader::Reference)
        # the xref entry names another object that contains this one
        object_streams[location] ||= PDF::Reader::ObjectStream.new(object(location))
        object_streams[location][key.id]
      end
    rescue InvalidObjectError
      return default
    end
  end

  # If key is a PDF::Reader::Reference object, lookup the corresponding
  # object in the PDF and return it. Otherwise return key untouched.
  #
  def object(key)
    key.is_a?(PDF::Reader::Reference) ? self[key] : key
  end

  # Access an object from the PDF. key can be an int or a PDF::Reader::Reference
  # object.
  #
  # If an int is used, the object with that ID and a generation number of 0 will
  # be returned.
  #
  # If a PDF::Reader::Reference object is used the exact ID and generation number
  # can be specified.
  #
  # local_default is the object that will be returned if the requested key doesn't
  # exist.
  #
  # Raises IndexError when nothing was found, no local_default was given and
  # the key is not a positive ID. Note that a positive key that finds nothing
  # returns nil rather than raising.
  #
  def fetch(key, local_default = nil)
    obj = self[key]
    if obj
      obj
    elsif local_default
      local_default
    elsif key.to_i <= 0
      raise IndexError, "#{key} is invalid"
    end
  end

  # iterate over each key, value. Just like a ruby hash.
  #
  def each(&block)
    xref.each do |ref|
      yield ref, self[ref]
    end
  end
  alias :each_pair :each

  # iterate over each key. Just like a ruby hash.
  #
  # Keys come straight from the xref table, so no objects are loaded.
  #
  def each_key(&block)
    xref.each do |ref|
      yield ref
    end
  end

  # iterate over each value. Just like a ruby hash.
  #
  def each_value(&block)
    each do |id, obj|
      yield obj
    end
  end

  # return the number of objects in the file. An object with multiple generations
  # is counted once.
  def size
    xref.size
  end
  alias :length :size

  # return true if there are no objects in this file
  #
  def empty?
    size == 0
  end

  # return true if the specified key exists in the file. key
  # can be an int or a PDF::Reader::Reference
  #
  def has_key?(check_key)
    # TODO update from O(n) to O(1)
    each_key do |key|
      if check_key.kind_of?(PDF::Reader::Reference)
        return true if check_key == key
      else
        return true if check_key.to_i == key.id
      end
    end
    false
  end
  alias :include? :has_key?
  alias :key? :has_key?
  alias :member? :has_key?

  # return true if the specified value exists in the file
  #
  def has_value?(value)
    # every object has to be loaded and compared, so this is O(n)
    each_value do |obj|
      return true if obj == value
    end
    false
  end
  alias :value? :has_value?

  def to_s
    "<PDF::Reader::ObjectHash size: #{self.size}>"
  end

  # return an array of all keys in the file
  #
  def keys
    ret = []
    each_key { |k| ret << k }
    ret
  end

  # return an array of all values in the file
  #
  def values
    ret = []
    each_value { |v| ret << v }
    ret
  end

  # return an array of all values from the specified keys
  #
  def values_at(*ids)
    ids.map { |id| self[id] }
  end

  # return an array of arrays. Each sub array contains a key/value pair.
  #
  def to_a
    ret = []
    each do |id, obj|
      ret << [id, obj]
    end
    ret
  end

  # returns an array of PDF::Reader::References. Each reference in the
  # array points to a Page object, one for each page in the PDF. The first
  # reference is page 1, second reference is page 2, etc.
  #
  # Useful for apps that want to extract data from specific pages.
  #
  def page_references
    # only walk the page tree the first time we're asked
    @page_references ||= begin
      root = fetch(trailer[:Root])
      get_page_objects(root[:Pages]).flatten
    end
  end

  private

  # build a buffer over the underlying IO, positioned at offset
  def new_buffer(offset = 0)
    PDF::Reader::Buffer.new(@io, :seek => offset)
  end

  def xref
    @xref
  end

  # cache of ObjectStream wrappers, keyed by the reference of the container
  def object_streams
    @object_stream ||= {}
  end

  # returns a nested array of object references for all pages in this object store.
  #
  # Returns nil for an object that is neither a :Page nor a :Pages node.
  #
  def get_page_objects(ref)
    obj = fetch(ref)

    if obj[:Type] == :Page
      ref
    elsif obj[:Type] == :Pages
      obj[:Kids].map { |kid| get_page_objects(kid) }
    end
  end

  # Read the version number from the first 10 bytes of the file and return it
  # as a float. Returns 0.0 if no "PDF-x.y" marker is found. The IO is
  # rewound to the start afterwards.
  #
  def read_version
    @io.seek(0)
    # read returns nil on an empty IO, hence the to_s
    header = @io.read(10).to_s
    @io.seek(0)
    match = header.match(/PDF-(\d\.\d)/)
    match ? match[1].to_f : 0.0
  end

end
|
275
|
+
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
class PDF::Reader
|
4
|
+
|
5
|
+
# Wraps a PDF stream object that holds other objects inside it. The spec
# calls this an "Object Stream"; it exists so that the contained objects can
# be compressed together.
#
class ObjectStream # :nodoc:

  # stream must respond to hash (the stream dictionary) and unfiltered_data
  def initialize(stream)
    @dict = stream.hash
    @data = stream.unfiltered_data
  end

  # Parse and return the object with the given id, or nil when the stream
  # holds no object with that id.
  #
  def [](objid)
    offset = offsets[objid]
    return nil if offset.nil?

    io = StringIO.new(@data)
    positioned = PDF::Reader::Buffer.new(io, :seek => offset)
    PDF::Reader::Parser.new(positioned).parse_token
  end

  # the :N entry of the stream dictionary
  def size
    @dict[:N]
  end

  private

  # Map of object id => byte offset into the stream data. Built on first use
  # by reading size pairs of tokens from the start of the data: an object id
  # followed by an offset that is added to the :First entry.
  #
  def offsets
    @offsets ||= {}
    return @offsets unless @offsets.empty?

    size.times do
      objid = buffer.token.to_i
      relative = buffer.token.to_i
      @offsets[objid] = first + relative
    end
    @offsets
  end

  # the :First entry of the stream dictionary
  def first
    @dict[:First]
  end

  # a single shared buffer over the stream data, used to read the id/offset
  # pairs
  def buffer
    @buffer ||= PDF::Reader::Buffer.new(StringIO.new(@data))
  end

end
|
49
|
+
|
50
|
+
end
|
51
|
+
|
@@ -22,12 +22,11 @@
|
|
22
22
|
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
23
23
|
#
|
24
24
|
################################################################################
|
25
|
-
require 'stringio'
|
26
25
|
|
27
26
|
class PDF::Reader
|
28
27
|
################################################################################
|
29
|
-
# Walks the PDF file and calls the appropriate callback methods when
|
30
|
-
# found.
|
28
|
+
# Walks the pages of the PDF file and calls the appropriate callback methods when
|
29
|
+
# something of interest is found.
|
31
30
|
#
|
32
31
|
# The callback methods should exist on the receiver object passed into the constructor. Whenever
|
33
32
|
# some content is found that will trigger a callback, the receiver is checked to see if the callback
|
@@ -78,6 +77,14 @@ class PDF::Reader
|
|
78
77
|
# - move_to_next_line_and_show_text
|
79
78
|
# - set_spacing_next_line_show_text
|
80
79
|
#
|
80
|
+
# If the :raw_text option was passed to the PDF::Reader class the following callbacks
|
81
|
+
# may also appear:
|
82
|
+
#
|
83
|
+
# - show_text_raw
|
84
|
+
# - show_text_with_positioning_raw
|
85
|
+
# - move_to_next_line_and_show_text_raw
|
86
|
+
# - set_spacing_next_line_show_text_raw
|
87
|
+
#
|
81
88
|
# == Graphics Callbacks
|
82
89
|
# - close_fill_stroke
|
83
90
|
# - fill_stroke
|
@@ -168,7 +175,7 @@ class PDF::Reader
|
|
168
175
|
# If it gets mapped to the name "IM1", then it can be placed on the page using
|
169
176
|
# invoke_xobject "IM1".
|
170
177
|
#
|
171
|
-
class
|
178
|
+
class PagesStrategy< AbstractStrategy # :nodoc:
|
172
179
|
OPERATORS = {
|
173
180
|
'b' => :close_fill_stroke,
|
174
181
|
'B' => :fill_stroke,
|
@@ -244,45 +251,19 @@ class PDF::Reader
|
|
244
251
|
'\'' => :move_to_next_line_and_show_text,
|
245
252
|
'"' => :set_spacing_next_line_show_text,
|
246
253
|
}
|
247
|
-
|
248
|
-
|
249
|
-
# - receiver - an object containing the required callback methods
|
250
|
-
# - xref - a PDF::Reader::Xref object that contains references to all the objects in a PDF file
|
251
|
-
def initialize (receiver, xref)
|
252
|
-
@receiver = receiver
|
253
|
-
@xref = xref
|
254
|
-
end
|
255
|
-
################################################################################
|
256
|
-
# Begin processing the document metadata
|
257
|
-
def metadata (root, info)
|
258
|
-
info = decode_strings(info)
|
259
|
-
|
260
|
-
# may be useful to some people
|
261
|
-
callback(:pdf_version, @xref.pdf_version)
|
262
|
-
|
263
|
-
# ye olde metadata
|
264
|
-
callback(:metadata, [info]) if info
|
265
|
-
|
266
|
-
# new style xml metadata
|
267
|
-
if root[:Metadata]
|
268
|
-
stream = @xref.object(root[:Metadata])
|
269
|
-
callback(:xml_metadata,stream.unfiltered_data)
|
270
|
-
end
|
271
|
-
|
272
|
-
# page count
|
273
|
-
if (pages = @xref.object(root[:Pages]))
|
274
|
-
if (count = @xref.object(pages[:Count]))
|
275
|
-
callback(:page_count, count.to_i)
|
276
|
-
end
|
277
|
-
end
|
254
|
+
def self.to_sym
|
255
|
+
:pages
|
278
256
|
end
|
279
257
|
################################################################################
|
280
258
|
# Begin processing the document
|
281
|
-
def
|
259
|
+
def process
|
260
|
+
return false unless options[:pages]
|
261
|
+
|
282
262
|
callback(:begin_document, [root])
|
283
|
-
walk_pages(@
|
263
|
+
walk_pages(@ohash.object(root[:Pages]))
|
284
264
|
callback(:end_document)
|
285
265
|
end
|
266
|
+
private
|
286
267
|
################################################################################
|
287
268
|
# Walk over all pages in the PDF file, calling the appropriate callbacks for each page and all
|
288
269
|
# its content
|
@@ -291,19 +272,19 @@ class PDF::Reader
|
|
291
272
|
# extract page content
|
292
273
|
if page[:Type] == :Pages
|
293
274
|
callback(:begin_page_container, [page])
|
294
|
-
res = @
|
275
|
+
res = @ohash.object(page[:Resources])
|
295
276
|
resources.push res if res
|
296
|
-
@
|
277
|
+
@ohash.object(page[:Kids]).each {|child| walk_pages(@ohash.object(child))}
|
297
278
|
resources.pop if res
|
298
279
|
callback(:end_page_container)
|
299
280
|
elsif page[:Type] == :Page
|
300
281
|
callback(:begin_page, [page])
|
301
|
-
res = @
|
282
|
+
res = @ohash.object(page[:Resources])
|
302
283
|
resources.push res if res
|
303
284
|
walk_resources(current_resources)
|
304
285
|
|
305
|
-
if @
|
306
|
-
contents = @
|
286
|
+
if @ohash.object(page[:Contents]).kind_of?(Array)
|
287
|
+
contents = @ohash.object(page[:Contents])
|
307
288
|
else
|
308
289
|
contents = [page[:Contents]]
|
309
290
|
end
|
@@ -311,10 +292,8 @@ class PDF::Reader
|
|
311
292
|
fonts = font_hash_from_resources(current_resources)
|
312
293
|
|
313
294
|
if page.has_key?(:Contents) and page[:Contents]
|
314
|
-
contents.
|
315
|
-
|
316
|
-
content_stream(obj, fonts)
|
317
|
-
end
|
295
|
+
direct_contents = contents.map { |content| @ohash.object(content) }
|
296
|
+
content_stream(direct_contents, fonts)
|
318
297
|
end
|
319
298
|
|
320
299
|
resources.pop if res
|
@@ -326,12 +305,12 @@ class PDF::Reader
|
|
326
305
|
# like a regular page content stream.
|
327
306
|
#
|
328
307
|
def walk_xobject_form(label)
|
329
|
-
xobjects = @
|
330
|
-
xobject = @
|
308
|
+
xobjects = @ohash.object(current_resources[:XObject]) || {}
|
309
|
+
xobject = @ohash.object(xobjects[label])
|
331
310
|
|
332
311
|
if xobject && xobject.hash[:Subtype] == :Form
|
333
312
|
callback(:begin_form_xobject)
|
334
|
-
resources = @
|
313
|
+
resources = @ohash.object(xobject.hash[:Resources])
|
335
314
|
walk_resources(resources) if resources
|
336
315
|
fonts = font_hash_from_resources(resources)
|
337
316
|
content_stream(xobject, fonts)
|
@@ -352,30 +331,40 @@ class PDF::Reader
|
|
352
331
|
################################################################################
|
353
332
|
# Reads a PDF content stream and calls all the appropriate callback methods for the operators
|
354
333
|
# it contains
|
334
|
+
#
|
355
335
|
def content_stream (instructions, fonts = {})
|
356
|
-
instructions = instructions
|
357
|
-
|
358
|
-
|
336
|
+
instructions = [instructions] unless instructions.kind_of?(Array)
|
337
|
+
instructions = instructions.map { |ins|
|
338
|
+
ins.is_a?(PDF::Reader::Stream) ? ins.unfiltered_data : ins.to_s
|
339
|
+
}.join
|
340
|
+
buffer = Buffer.new(StringIO.new(instructions), :content_stream => true)
|
341
|
+
parser = Parser.new(buffer, @ohash)
|
359
342
|
current_font = nil
|
360
343
|
params = []
|
361
344
|
|
362
345
|
while (token = parser.parse_token(OPERATORS))
|
363
346
|
if token.kind_of?(Token) and OPERATORS.has_key?(token)
|
364
|
-
|
347
|
+
if OPERATORS[token] == :set_text_font_and_size
|
348
|
+
current_font = params.first
|
349
|
+
if fonts[current_font].nil?
|
350
|
+
raise MalformedPDFError, "Unknown font #{current_font}"
|
351
|
+
end
|
352
|
+
end
|
365
353
|
|
366
354
|
# handle special cases in response to certain operators
|
367
|
-
if OPERATORS[token].to_s.include?("show_text")
|
368
|
-
# convert any text to utf-8
|
355
|
+
if OPERATORS[token].to_s.include?("show_text")
|
356
|
+
# convert any text to utf-8, but output the raw string if the user wants it
|
357
|
+
if options[:raw_text]
|
358
|
+
callback("#{OPERATORS[token]}_raw".to_sym, params)
|
359
|
+
end
|
369
360
|
params = fonts[current_font].to_utf8(params)
|
370
361
|
elsif token == "ID"
|
371
362
|
# inline image data, first convert the current params into a more familiar hash
|
372
363
|
map = {}
|
373
|
-
params.each_slice(2) do |
|
374
|
-
map[
|
364
|
+
params.each_slice(2) do |key, value|
|
365
|
+
map[key] = value
|
375
366
|
end
|
376
|
-
params = [map]
|
377
|
-
# read the raw image data from the buffer without tokenising
|
378
|
-
params << buffer.read_until("EI")
|
367
|
+
params = [map, buffer.token]
|
379
368
|
end
|
380
369
|
|
381
370
|
callback(OPERATORS[token], params)
|
@@ -407,29 +396,29 @@ class PDF::Reader
|
|
407
396
|
|
408
397
|
# extract any xobject information
|
409
398
|
if resources[:XObject]
|
410
|
-
@
|
411
|
-
callback(:resource_xobject, [name, @
|
399
|
+
@ohash.object(resources[:XObject]).each do |name, val|
|
400
|
+
callback(:resource_xobject, [name, @ohash.object(val)])
|
412
401
|
end
|
413
402
|
end
|
414
403
|
|
415
404
|
# extract any extgstate information
|
416
405
|
if resources[:ExtGState]
|
417
|
-
@
|
418
|
-
callback(:resource_extgstate, [name, @
|
406
|
+
@ohash.object(resources[:ExtGState]).each do |name, val|
|
407
|
+
callback(:resource_extgstate, [name, @ohash.object(val)])
|
419
408
|
end
|
420
409
|
end
|
421
410
|
|
422
411
|
# extract any colorspace information
|
423
412
|
if resources[:ColorSpace]
|
424
|
-
@
|
425
|
-
callback(:resource_colorspace, [name, @
|
413
|
+
@ohash.object(resources[:ColorSpace]).each do |name, val|
|
414
|
+
callback(:resource_colorspace, [name, @ohash.object(val)])
|
426
415
|
end
|
427
416
|
end
|
428
417
|
|
429
418
|
# extract any pattern information
|
430
419
|
if resources[:Pattern]
|
431
|
-
@
|
432
|
-
callback(:resource_pattern, [name, @
|
420
|
+
@ohash.object(resources[:Pattern]).each do |name, val|
|
421
|
+
callback(:resource_pattern, [name, @ohash.object(val)])
|
433
422
|
end
|
434
423
|
end
|
435
424
|
|
@@ -449,7 +438,7 @@ class PDF::Reader
|
|
449
438
|
obj.hash = resolve_references(obj.hash)
|
450
439
|
obj
|
451
440
|
when PDF::Reader::Reference then
|
452
|
-
resolve_references(@
|
441
|
+
resolve_references(@ohash.object(obj))
|
453
442
|
when Hash then obj.each { |key,val| obj[key] = resolve_references(val) }
|
454
443
|
when Array then obj.collect { |item| resolve_references(item) }
|
455
444
|
else
|
@@ -457,53 +446,27 @@ class PDF::Reader
|
|
457
446
|
end
|
458
447
|
end
|
459
448
|
################################################################################
|
460
|
-
# calls the name callback method on the receiver class with params as the arguments
|
461
|
-
def callback (name, params=[])
|
462
|
-
@receiver.send(name, *params) if @receiver.respond_to?(name)
|
463
|
-
end
|
464
|
-
################################################################################
|
465
|
-
private
|
466
449
|
################################################################################
|
467
450
|
def font_hash_from_resources(resources)
|
468
451
|
return {} unless resources.respond_to?(:[])
|
469
452
|
|
470
453
|
fonts = {}
|
471
|
-
resources = @
|
454
|
+
resources = @ohash.object(resources[:Font]) || {}
|
472
455
|
resources.each do |label, desc|
|
473
|
-
desc = @
|
456
|
+
desc = @ohash.object(desc)
|
474
457
|
fonts[label] = PDF::Reader::Font.new
|
475
458
|
fonts[label].label = label
|
476
459
|
fonts[label].subtype = desc[:Subtype] if desc[:Subtype]
|
477
460
|
fonts[label].basefont = desc[:BaseFont] if desc[:BaseFont]
|
478
|
-
fonts[label].encoding = PDF::Reader::Encoding.new(@
|
461
|
+
fonts[label].encoding = PDF::Reader::Encoding.new(@ohash.object(desc[:Encoding]))
|
479
462
|
fonts[label].descendantfonts = desc[:DescendantFonts] if desc[:DescendantFonts]
|
480
463
|
if desc[:ToUnicode]
|
481
|
-
|
482
|
-
|
483
|
-
stream = desc[:ToUnicode]
|
484
|
-
fonts[label].tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
|
485
|
-
rescue
|
486
|
-
# if the CMap fails to parse, don't worry too much. Means we can't translate the text properly
|
487
|
-
end
|
464
|
+
stream = @ohash.object(desc[:ToUnicode])
|
465
|
+
fonts[label].tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
|
488
466
|
end
|
489
467
|
end
|
490
468
|
fonts
|
491
469
|
end
|
492
|
-
# strings outside of page content should be in either PDFDocEncoding or UTF-16.
|
493
|
-
def decode_strings(obj)
|
494
|
-
case obj
|
495
|
-
when String then
|
496
|
-
if obj[0,2] == "\376\377"
|
497
|
-
PDF::Reader::Encoding.new(:UTF16Encoding).to_utf8(obj[2, obj.size])
|
498
|
-
else
|
499
|
-
PDF::Reader::Encoding.new(:PDFDocEncoding).to_utf8(obj)
|
500
|
-
end
|
501
|
-
when Hash then obj.each { |key,val| obj[key] = decode_strings(val) }
|
502
|
-
when Array then obj.collect { |item| decode_strings(item) }
|
503
|
-
else
|
504
|
-
obj
|
505
|
-
end
|
506
|
-
end
|
507
470
|
def resources
|
508
471
|
@resources ||= []
|
509
472
|
end
|