pdf-reader 0.8.6 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +17 -0
- data/README.rdoc +7 -15
- data/Rakefile +10 -63
- data/TODO +6 -8
- data/bin/pdf_object +3 -0
- data/bin/pdf_text +4 -2
- data/examples/extract_images.rb +108 -0
- data/examples/hash.rb +1 -1
- data/examples/text.rb +3 -0
- data/lib/pdf/hash.rb +8 -225
- data/lib/pdf/reader.rb +79 -55
- data/lib/pdf/reader/abstract_strategy.rb +77 -0
- data/lib/pdf/reader/buffer.rb +61 -40
- data/lib/pdf/reader/cmap.rb +11 -10
- data/lib/pdf/reader/encoding.rb +85 -79
- data/lib/pdf/reader/error.rb +1 -2
- data/lib/pdf/reader/filter.rb +109 -6
- data/lib/pdf/reader/font.rb +11 -11
- data/lib/pdf/reader/lzw.rb +123 -0
- data/lib/pdf/reader/metadata_strategy.rb +53 -0
- data/lib/pdf/reader/object_hash.rb +275 -0
- data/lib/pdf/reader/object_stream.rb +51 -0
- data/lib/pdf/reader/{content.rb → pages_strategy.rb} +63 -100
- data/lib/pdf/reader/parser.rb +74 -37
- data/lib/pdf/reader/print_receiver.rb +0 -1
- data/lib/pdf/reader/register_receiver.rb +21 -0
- data/lib/pdf/reader/stream.rb +5 -1
- data/lib/pdf/reader/text_receiver.rb +3 -1
- data/lib/pdf/reader/token.rb +1 -1
- data/lib/pdf/reader/xref.rb +126 -64
- metadata +61 -13
- data/lib/pdf/reader/explore.rb +0 -116
@@ -0,0 +1,53 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
class PDF::Reader
|
4
|
+
|
5
|
+
# Strategy that reports document-level information (PDF version, info
# dictionary, XML metadata and page count) to the receiver via callbacks.
#
# NOTE(review): options, callback, ohash, root, info, info?, pages, pages?
# and decode_strings are not defined in this class; presumably they are
# inherited from AbstractStrategy - confirm against that file.
#
class MetadataStrategy < AbstractStrategy # :nodoc:

  # The symbol that identifies this strategy.
  #
  def self.to_sym
    :metadata
  end

  # Emit the metadata callbacks, in this order: :pdf_version, :metadata,
  # :xml_metadata, :page_count. The last three are only sent when the
  # matching data is present.
  #
  # Returns false straight away when the :metadata option is not set.
  #
  def process
    return false unless options[:metadata]

    # the version is always reported
    callback(:pdf_version, ohash.pdf_version)

    # classic info dictionary, with its strings decoded
    callback(:metadata, [decoded_info]) if info?

    # XML metadata stream referenced from the root object
    callback(:xml_metadata, [xml_metadata]) if xml_metadata?

    # nothing more to report without a page tree
    return unless pages?

    callback(:page_count, ohash.object(pages[:Count]).to_i)
  end

  private

  # The contents of the stream referenced by root[:Metadata], or nil when the
  # root has no :Metadata entry. The result (including nil) is cached in
  # @xml_metadata, so the stream is only read once.
  #
  def xml_metadata
    return @xml_metadata if defined?(@xml_metadata)

    metadata_ref = root[:Metadata]
    if metadata_ref.nil?
      @xml_metadata = nil
      return nil
    end

    raw = ohash.object(metadata_ref).unfiltered_data
    # only strings that know about encodings can be tagged as utf-8
    raw.force_encoding("utf-8") if raw.respond_to?(:force_encoding)
    @xml_metadata = raw
  end

  # true when XML metadata is available, false otherwise.
  #
  def xml_metadata?
    !!xml_metadata
  end

  # The info data after being passed through decode_strings, cached in
  # @decoded_info.
  #
  def decoded_info
    @decoded_info ||= decode_strings(info)
  end

end
|
53
|
+
end
|
@@ -0,0 +1,275 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
class PDF::Reader
|
4
|
+
# Provides low level access to the objects in a PDF file via a hash-like
|
5
|
+
# object.
|
6
|
+
#
|
7
|
+
# A PDF file can be viewed as a large hash map. It is a series of objects
|
8
|
+
# stored at exact byte offsets, and a table that maps object IDs to byte
|
9
|
+
# offsets. Given an object ID, looking up an object is an O(1) operation.
|
10
|
+
#
|
11
|
+
# Each PDF object can be mapped to a ruby object, so by passing an object
|
12
|
+
# ID to the [] method, a ruby representation of that object will be
|
13
|
+
# retrieved.
|
14
|
+
#
|
15
|
+
# The class behaves much like a standard Ruby hash, including the use of
|
16
|
+
# the Enumerable mixin. The key difference is no []= method - the hash
|
17
|
+
# is read only.
|
18
|
+
#
|
19
|
+
# == Basic Usage
|
20
|
+
#
|
21
|
+
# h = PDF::Reader::ObjectHash.new("somefile.pdf")
|
22
|
+
# h[1]
|
23
|
+
# => 3469
|
24
|
+
#
|
25
|
+
# h[PDF::Reader::Reference.new(1,0)]
|
26
|
+
# => 3469
|
27
|
+
#
|
28
|
+
class ObjectHash
  include Enumerable

  # The value returned by #[] when the key is not a positive number or when
  # the lookup raises InvalidObjectError. nil unless set by the caller.
  attr_accessor :default

  # trailer is taken from the xref table when the file is opened, pdf_version
  # is the version number found in the file header (as a Float).
  attr_reader :trailer, :pdf_version

  # Creates a new ObjectHash object. input can be an IO-like object (anything
  # that responds to seek and read) or the name of an existing file.
  #
  # Any other input raises an ArgumentError. In particular, a string that
  # holds the raw bytes of a PDF is not accepted - wrap it in a StringIO.
  #
  def initialize(input)
    if input.respond_to?(:seek) && input.respond_to?(:read)
      @io = input
    elsif File.file?(input.to_s)
      # load the whole file into memory, in binary mode when the runtime
      # provides File.binread
      if File.respond_to?(:binread)
        input = File.binread(input.to_s)
      else
        input = File.read(input.to_s)
      end
      @io = StringIO.new(input)
    else
      raise ArgumentError, "input must be an IO-like object or a filename"
    end
    @pdf_version = read_version
    @xref        = PDF::Reader::XRef.new(@io)
    @trailer     = @xref.trailer
  end

  # returns the class name (as a symbol) of the object a ref points to, or
  # nil if the lookup raises an error
  def obj_type(ref)
    self[ref].class.to_s.to_sym
  rescue
    nil
  end

  # returns true if the supplied reference points to an object with a stream.
  # Any error raised during the lookup is treated as "not a stream".
  def stream?(ref)
    self[ref].class == PDF::Reader::Stream
  rescue
    false
  end

  # Access an object from the PDF. key can be an int or a PDF::Reader::Reference
  # object.
  #
  # If an int is used, the object with that ID and a generation number of 0 will
  # be returned.
  #
  # If a PDF::Reader::Reference object is used the exact ID and generation number
  # can be specified.
  #
  # Returns default when key.to_i is zero or negative, or when the lookup
  # raises InvalidObjectError. Returns nil when the xref table has neither a
  # byte offset nor a container reference for the key.
  #
  def [](key)
    return default if key.to_i <= 0

    begin
      unless key.kind_of?(PDF::Reader::Reference)
        key = PDF::Reader::Reference.new(key.to_i, 0)
      end

      location = xref[key]
      # Integer rather than Fixnum: Fixnum no longer exists on current rubies
      # and never matched very large (Bignum) offsets
      if location.is_a?(Integer)
        # a byte offset: parse the object straight out of the file
        buf = new_buffer(location)
        Parser.new(buf, self).object(key.id, key.gen)
      elsif location.is_a?(PDF::Reader::Reference)
        # the object lives inside an object stream; load the container once
        # and keep it around for later lookups
        object_streams[location] ||= PDF::Reader::ObjectStream.new(object(location))
        object_streams[location][key.id]
      end
    rescue InvalidObjectError
      return default
    end
  end

  # If key is a PDF::Reader::Reference object, lookup the corresponding
  # object in the PDF and return it. Otherwise return key untouched.
  #
  def object(key)
    key.is_a?(PDF::Reader::Reference) ? self[key] : key
  end

  # Access an object from the PDF. key can be an int or a PDF::Reader::Reference
  # object, exactly as for #[].
  #
  # local_default is the object that will be returned if the lookup finds
  # nothing (or finds nil/false).
  #
  # Without a local_default, an IndexError is raised when key.to_i is zero or
  # negative. For any other missing key nil is returned.
  #
  def fetch(key, local_default = nil)
    obj = self[key]
    return obj if obj
    return local_default if local_default

    raise IndexError, "#{key} is invalid" if key.to_i <= 0
    nil
  end

  # iterate over each key, value. Just like a ruby hash. The key is whatever
  # the xref table yields, the value is the object looked up with #[].
  #
  def each(&block)
    @xref.each do |ref|
      yield ref, self[ref]
    end
  end
  alias :each_pair :each

  # iterate over each key. Just like a ruby hash.
  #
  # Walks the xref table directly, so no objects are parsed along the way.
  #
  def each_key(&block)
    @xref.each do |ref|
      yield ref
    end
  end

  # iterate over each value. Just like a ruby hash.
  #
  def each_value(&block)
    each do |id, obj|
      yield obj
    end
  end

  # return the number of entries reported by the xref table.
  def size
    xref.size
  end
  alias :length :size

  # return true if there are no objects in this file
  #
  def empty?
    size == 0
  end

  # return true if the specified key exists in the file. key
  # can be an int or a PDF::Reader::Reference
  #
  # A Reference must match a key exactly, anything else is compared
  # against the id of each key.
  #
  def has_key?(check_key)
    # TODO update from O(n) to O(1)
    by_reference = check_key.kind_of?(PDF::Reader::Reference)
    each_key do |key|
      if by_reference
        return true if check_key == key
      else
        return true if check_key.to_i == key.id
      end
    end
    false
  end
  alias :include? :has_key?
  alias :key? :has_key?
  alias :member? :has_key?

  # return true if the specified value exists in the file
  #
  def has_value?(value)
    # TODO update from O(n) to O(1)
    each_value do |obj|
      return true if obj == value
    end
    false
  end
  alias :value? :has_value?

  def to_s
    "<PDF::Reader::ObjectHash size: #{self.size}>"
  end

  # return an array of all keys in the file
  #
  def keys
    ret = []
    each_key { |k| ret << k }
    ret
  end

  # return an array of all values in the file
  #
  def values
    ret = []
    each_value { |v| ret << v }
    ret
  end

  # return an array of all values from the specified keys
  #
  def values_at(*ids)
    ids.map { |id| self[id] }
  end

  # return an array of arrays. Each sub array contains a key/value pair.
  #
  def to_a
    ret = []
    each do |id, obj|
      ret << [id, obj]
    end
    ret
  end

  # returns an array of PDF::Reader::References. Each reference in the
  # array points to a Page object, one for each page in the PDF. The first
  # reference is page 1, second reference is page 2, etc.
  #
  # Useful for apps that want to extract data from specific pages.
  #
  # The page tree is only walked on the first call, the result is cached.
  #
  def page_references
    @page_references ||= begin
      root = fetch(trailer[:Root])
      get_page_objects(root[:Pages]).flatten
    end
  end

  private

  # a buffer over the underlying IO, positioned at offset
  def new_buffer(offset = 0)
    PDF::Reader::Buffer.new(@io, :seek => offset)
  end

  def xref
    @xref
  end

  # cache of ObjectStream wrappers, keyed by the reference of the container
  def object_streams
    @object_stream ||= {}
  end

  # returns a nested array of object references for all pages in this object store.
  #
  # A :Page node contributes its own reference, a :Pages node contributes an
  # array built from its :Kids. Any other node contributes nil.
  #
  def get_page_objects(ref)
    obj = fetch(ref)

    if obj[:Type] == :Page
      ref
    elsif obj[:Type] == :Pages
      # :Kids may itself be stored as an indirect object
      object(obj[:Kids]).map { |kid| get_page_objects(kid) }
    end
  end

  # Reads the version number from the first 10 bytes of the file, leaving the
  # IO positioned at the start again. Returns 0.0 if no "PDF-x.y" marker is
  # found.
  #
  def read_version
    @io.seek(0)
    header = @io.read(10).to_s # read returns nil on an empty stream
    @io.seek(0)
    header[/PDF-(\d\.\d)/, 1].to_f
  end

end
|
275
|
+
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
class PDF::Reader
|
4
|
+
|
5
|
+
# provides a wrapper around a PDF stream object that contains other objects in it.
|
6
|
+
# This is done for added compression and is described as an "Object Stream" in the spec.
|
7
|
+
#
|
8
|
+
class ObjectStream # :nodoc:

  # stream must respond to hash (the stream dictionary) and unfiltered_data
  # (the decoded stream contents). Both are captured here.
  #
  def initialize(stream)
    @dict = stream.hash
    @data = stream.unfiltered_data
  end

  # Return the object stored in this stream under the ID objid, or nil when
  # the stream has no entry for that ID.
  #
  # A new buffer is positioned at the recorded offset of the object and a
  # single token is parsed from it.
  #
  def [](objid)
    return nil if offsets[objid].nil?

    io     = StringIO.new(@data)
    buf    = PDF::Reader::Buffer.new(io, :seek => offsets[objid])
    PDF::Reader::Parser.new(buf).parse_token
  end

  # The :N entry of the stream dictionary. It is used as the number of
  # id/offset pairs to read from the start of the data.
  #
  def size
    @dict[:N]
  end

  private

  # A hash that maps object IDs to their position in the data. It is built on
  # first use by reading pairs of integer tokens from the start of the data:
  # the first of each pair is the object ID, the second is an offset that is
  # added to the :First entry of the dictionary.
  #
  def offsets
    @offsets ||= {}
    return @offsets unless @offsets.keys.size == 0

    size.times do
      # the ID token must be consumed before the offset token
      objid = buffer.token.to_i
      @offsets[objid] = first + buffer.token.to_i
    end
    @offsets
  end

  # The :First entry of the stream dictionary.
  #
  def first
    @dict[:First]
  end

  # A single shared buffer over the data, used for reading the id/offset pairs.
  #
  def buffer
    @buffer ||= PDF::Reader::Buffer.new(StringIO.new(@data))
  end

end
|
49
|
+
|
50
|
+
end
|
51
|
+
|
@@ -22,12 +22,11 @@
|
|
22
22
|
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
23
23
|
#
|
24
24
|
################################################################################
|
25
|
-
require 'stringio'
|
26
25
|
|
27
26
|
class PDF::Reader
|
28
27
|
################################################################################
|
29
|
-
# Walks the PDF file and calls the appropriate callback methods when
|
30
|
-
# found.
|
28
|
+
# Walks the pages of the PDF file and calls the appropriate callback methods when
|
29
|
+
# something of interest is found.
|
31
30
|
#
|
32
31
|
# The callback methods should exist on the receiver object passed into the constructor. Whenever
|
33
32
|
# some content is found that will trigger a callback, the receiver is checked to see if the callback
|
@@ -78,6 +77,14 @@ class PDF::Reader
|
|
78
77
|
# - move_to_next_line_and_show_text
|
79
78
|
# - set_spacing_next_line_show_text
|
80
79
|
#
|
80
|
+
# If the :raw_text option was passed to the PDF::Reader class the following callbacks
|
81
|
+
# may also appear:
|
82
|
+
#
|
83
|
+
# - show_text_raw
|
84
|
+
# - show_text_with_positioning_raw
|
85
|
+
# - move_to_next_line_and_show_text_raw
|
86
|
+
# - set_spacing_next_line_show_text_raw
|
87
|
+
#
|
81
88
|
# == Graphics Callbacks
|
82
89
|
# - close_fill_stroke
|
83
90
|
# - fill_stroke
|
@@ -168,7 +175,7 @@ class PDF::Reader
|
|
168
175
|
# If it gets mapped to the name "IM1", then it can be placed on the page using
|
169
176
|
# invoke_xobject "IM1".
|
170
177
|
#
|
171
|
-
class
|
178
|
+
class PagesStrategy< AbstractStrategy # :nodoc:
|
172
179
|
OPERATORS = {
|
173
180
|
'b' => :close_fill_stroke,
|
174
181
|
'B' => :fill_stroke,
|
@@ -244,45 +251,19 @@ class PDF::Reader
|
|
244
251
|
'\'' => :move_to_next_line_and_show_text,
|
245
252
|
'"' => :set_spacing_next_line_show_text,
|
246
253
|
}
|
247
|
-
|
248
|
-
|
249
|
-
# - receiver - an object containing the required callback methods
|
250
|
-
# - xref - a PDF::Reader::Xref object that contains references to all the objects in a PDF file
|
251
|
-
def initialize (receiver, xref)
|
252
|
-
@receiver = receiver
|
253
|
-
@xref = xref
|
254
|
-
end
|
255
|
-
################################################################################
|
256
|
-
# Begin processing the document metadata
|
257
|
-
def metadata (root, info)
|
258
|
-
info = decode_strings(info)
|
259
|
-
|
260
|
-
# may be useful to some people
|
261
|
-
callback(:pdf_version, @xref.pdf_version)
|
262
|
-
|
263
|
-
# ye olde metadata
|
264
|
-
callback(:metadata, [info]) if info
|
265
|
-
|
266
|
-
# new style xml metadata
|
267
|
-
if root[:Metadata]
|
268
|
-
stream = @xref.object(root[:Metadata])
|
269
|
-
callback(:xml_metadata,stream.unfiltered_data)
|
270
|
-
end
|
271
|
-
|
272
|
-
# page count
|
273
|
-
if (pages = @xref.object(root[:Pages]))
|
274
|
-
if (count = @xref.object(pages[:Count]))
|
275
|
-
callback(:page_count, count.to_i)
|
276
|
-
end
|
277
|
-
end
|
254
|
+
def self.to_sym
|
255
|
+
:pages
|
278
256
|
end
|
279
257
|
################################################################################
|
280
258
|
# Begin processing the document
|
281
|
-
def
|
259
|
+
def process
|
260
|
+
return false unless options[:pages]
|
261
|
+
|
282
262
|
callback(:begin_document, [root])
|
283
|
-
walk_pages(@
|
263
|
+
walk_pages(@ohash.object(root[:Pages]))
|
284
264
|
callback(:end_document)
|
285
265
|
end
|
266
|
+
private
|
286
267
|
################################################################################
|
287
268
|
# Walk over all pages in the PDF file, calling the appropriate callbacks for each page and all
|
288
269
|
# its content
|
@@ -291,19 +272,19 @@ class PDF::Reader
|
|
291
272
|
# extract page content
|
292
273
|
if page[:Type] == :Pages
|
293
274
|
callback(:begin_page_container, [page])
|
294
|
-
res = @
|
275
|
+
res = @ohash.object(page[:Resources])
|
295
276
|
resources.push res if res
|
296
|
-
@
|
277
|
+
@ohash.object(page[:Kids]).each {|child| walk_pages(@ohash.object(child))}
|
297
278
|
resources.pop if res
|
298
279
|
callback(:end_page_container)
|
299
280
|
elsif page[:Type] == :Page
|
300
281
|
callback(:begin_page, [page])
|
301
|
-
res = @
|
282
|
+
res = @ohash.object(page[:Resources])
|
302
283
|
resources.push res if res
|
303
284
|
walk_resources(current_resources)
|
304
285
|
|
305
|
-
if @
|
306
|
-
contents = @
|
286
|
+
if @ohash.object(page[:Contents]).kind_of?(Array)
|
287
|
+
contents = @ohash.object(page[:Contents])
|
307
288
|
else
|
308
289
|
contents = [page[:Contents]]
|
309
290
|
end
|
@@ -311,10 +292,8 @@ class PDF::Reader
|
|
311
292
|
fonts = font_hash_from_resources(current_resources)
|
312
293
|
|
313
294
|
if page.has_key?(:Contents) and page[:Contents]
|
314
|
-
contents.
|
315
|
-
|
316
|
-
content_stream(obj, fonts)
|
317
|
-
end
|
295
|
+
direct_contents = contents.map { |content| @ohash.object(content) }
|
296
|
+
content_stream(direct_contents, fonts)
|
318
297
|
end
|
319
298
|
|
320
299
|
resources.pop if res
|
@@ -326,12 +305,12 @@ class PDF::Reader
|
|
326
305
|
# like a regular page content stream.
|
327
306
|
#
|
328
307
|
def walk_xobject_form(label)
|
329
|
-
xobjects = @
|
330
|
-
xobject = @
|
308
|
+
xobjects = @ohash.object(current_resources[:XObject]) || {}
|
309
|
+
xobject = @ohash.object(xobjects[label])
|
331
310
|
|
332
311
|
if xobject && xobject.hash[:Subtype] == :Form
|
333
312
|
callback(:begin_form_xobject)
|
334
|
-
resources = @
|
313
|
+
resources = @ohash.object(xobject.hash[:Resources])
|
335
314
|
walk_resources(resources) if resources
|
336
315
|
fonts = font_hash_from_resources(resources)
|
337
316
|
content_stream(xobject, fonts)
|
@@ -352,30 +331,40 @@ class PDF::Reader
|
|
352
331
|
################################################################################
|
353
332
|
# Reads a PDF content stream and calls all the appropriate callback methods for the operators
|
354
333
|
# it contains
|
334
|
+
#
|
355
335
|
def content_stream (instructions, fonts = {})
|
356
|
-
instructions = instructions
|
357
|
-
|
358
|
-
|
336
|
+
instructions = [instructions] unless instructions.kind_of?(Array)
|
337
|
+
instructions = instructions.map { |ins|
|
338
|
+
ins.is_a?(PDF::Reader::Stream) ? ins.unfiltered_data : ins.to_s
|
339
|
+
}.join
|
340
|
+
buffer = Buffer.new(StringIO.new(instructions), :content_stream => true)
|
341
|
+
parser = Parser.new(buffer, @ohash)
|
359
342
|
current_font = nil
|
360
343
|
params = []
|
361
344
|
|
362
345
|
while (token = parser.parse_token(OPERATORS))
|
363
346
|
if token.kind_of?(Token) and OPERATORS.has_key?(token)
|
364
|
-
|
347
|
+
if OPERATORS[token] == :set_text_font_and_size
|
348
|
+
current_font = params.first
|
349
|
+
if fonts[current_font].nil?
|
350
|
+
raise MalformedPDFError, "Unknown font #{current_font}"
|
351
|
+
end
|
352
|
+
end
|
365
353
|
|
366
354
|
# handle special cases in response to certain operators
|
367
|
-
if OPERATORS[token].to_s.include?("show_text")
|
368
|
-
# convert any text to utf-8
|
355
|
+
if OPERATORS[token].to_s.include?("show_text")
|
356
|
+
# convert any text to utf-8, but output the raw string if the user wants it
|
357
|
+
if options[:raw_text]
|
358
|
+
callback("#{OPERATORS[token]}_raw".to_sym, params)
|
359
|
+
end
|
369
360
|
params = fonts[current_font].to_utf8(params)
|
370
361
|
elsif token == "ID"
|
371
362
|
# inline image data, first convert the current params into a more familiar hash
|
372
363
|
map = {}
|
373
|
-
params.each_slice(2) do |
|
374
|
-
map[
|
364
|
+
params.each_slice(2) do |key, value|
|
365
|
+
map[key] = value
|
375
366
|
end
|
376
|
-
params = [map]
|
377
|
-
# read the raw image data from the buffer without tokenising
|
378
|
-
params << buffer.read_until("EI")
|
367
|
+
params = [map, buffer.token]
|
379
368
|
end
|
380
369
|
|
381
370
|
callback(OPERATORS[token], params)
|
@@ -407,29 +396,29 @@ class PDF::Reader
|
|
407
396
|
|
408
397
|
# extract any xobject information
|
409
398
|
if resources[:XObject]
|
410
|
-
@
|
411
|
-
callback(:resource_xobject, [name, @
|
399
|
+
@ohash.object(resources[:XObject]).each do |name, val|
|
400
|
+
callback(:resource_xobject, [name, @ohash.object(val)])
|
412
401
|
end
|
413
402
|
end
|
414
403
|
|
415
404
|
# extract any extgstate information
|
416
405
|
if resources[:ExtGState]
|
417
|
-
@
|
418
|
-
callback(:resource_extgstate, [name, @
|
406
|
+
@ohash.object(resources[:ExtGState]).each do |name, val|
|
407
|
+
callback(:resource_extgstate, [name, @ohash.object(val)])
|
419
408
|
end
|
420
409
|
end
|
421
410
|
|
422
411
|
# extract any colorspace information
|
423
412
|
if resources[:ColorSpace]
|
424
|
-
@
|
425
|
-
callback(:resource_colorspace, [name, @
|
413
|
+
@ohash.object(resources[:ColorSpace]).each do |name, val|
|
414
|
+
callback(:resource_colorspace, [name, @ohash.object(val)])
|
426
415
|
end
|
427
416
|
end
|
428
417
|
|
429
418
|
# extract any pattern information
|
430
419
|
if resources[:Pattern]
|
431
|
-
@
|
432
|
-
callback(:resource_pattern, [name, @
|
420
|
+
@ohash.object(resources[:Pattern]).each do |name, val|
|
421
|
+
callback(:resource_pattern, [name, @ohash.object(val)])
|
433
422
|
end
|
434
423
|
end
|
435
424
|
|
@@ -449,7 +438,7 @@ class PDF::Reader
|
|
449
438
|
obj.hash = resolve_references(obj.hash)
|
450
439
|
obj
|
451
440
|
when PDF::Reader::Reference then
|
452
|
-
resolve_references(@
|
441
|
+
resolve_references(@ohash.object(obj))
|
453
442
|
when Hash then obj.each { |key,val| obj[key] = resolve_references(val) }
|
454
443
|
when Array then obj.collect { |item| resolve_references(item) }
|
455
444
|
else
|
@@ -457,53 +446,27 @@ class PDF::Reader
|
|
457
446
|
end
|
458
447
|
end
|
459
448
|
################################################################################
|
460
|
-
# calls the name callback method on the receiver class with params as the arguments
|
461
|
-
def callback (name, params=[])
|
462
|
-
@receiver.send(name, *params) if @receiver.respond_to?(name)
|
463
|
-
end
|
464
|
-
################################################################################
|
465
|
-
private
|
466
449
|
################################################################################
|
467
450
|
def font_hash_from_resources(resources)
|
468
451
|
return {} unless resources.respond_to?(:[])
|
469
452
|
|
470
453
|
fonts = {}
|
471
|
-
resources = @
|
454
|
+
resources = @ohash.object(resources[:Font]) || {}
|
472
455
|
resources.each do |label, desc|
|
473
|
-
desc = @
|
456
|
+
desc = @ohash.object(desc)
|
474
457
|
fonts[label] = PDF::Reader::Font.new
|
475
458
|
fonts[label].label = label
|
476
459
|
fonts[label].subtype = desc[:Subtype] if desc[:Subtype]
|
477
460
|
fonts[label].basefont = desc[:BaseFont] if desc[:BaseFont]
|
478
|
-
fonts[label].encoding = PDF::Reader::Encoding.new(@
|
461
|
+
fonts[label].encoding = PDF::Reader::Encoding.new(@ohash.object(desc[:Encoding]))
|
479
462
|
fonts[label].descendantfonts = desc[:DescendantFonts] if desc[:DescendantFonts]
|
480
463
|
if desc[:ToUnicode]
|
481
|
-
|
482
|
-
|
483
|
-
stream = desc[:ToUnicode]
|
484
|
-
fonts[label].tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
|
485
|
-
rescue
|
486
|
-
# if the CMap fails to parse, don't worry too much. Means we can't translate the text properly
|
487
|
-
end
|
464
|
+
stream = @ohash.object(desc[:ToUnicode])
|
465
|
+
fonts[label].tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
|
488
466
|
end
|
489
467
|
end
|
490
468
|
fonts
|
491
469
|
end
|
492
|
-
# strings outside of page content should be in either PDFDocEncoding or UTF-16.
|
493
|
-
def decode_strings(obj)
|
494
|
-
case obj
|
495
|
-
when String then
|
496
|
-
if obj[0,2] == "\376\377"
|
497
|
-
PDF::Reader::Encoding.new(:UTF16Encoding).to_utf8(obj[2, obj.size])
|
498
|
-
else
|
499
|
-
PDF::Reader::Encoding.new(:PDFDocEncoding).to_utf8(obj)
|
500
|
-
end
|
501
|
-
when Hash then obj.each { |key,val| obj[key] = decode_strings(val) }
|
502
|
-
when Array then obj.collect { |item| decode_strings(item) }
|
503
|
-
else
|
504
|
-
obj
|
505
|
-
end
|
506
|
-
end
|
507
470
|
def resources
|
508
471
|
@resources ||= []
|
509
472
|
end
|