pdf-reader 0.7.6 → 0.7.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +5 -0
- data/README.rdoc +1 -171
- data/Rakefile +1 -1
- data/TODO +2 -0
- data/examples/callbacks.rb +16 -0
- data/examples/extract_bates.rb +63 -0
- data/examples/metadata.rb +25 -0
- data/examples/page_counter_improved.rb +23 -0
- data/examples/page_counter_naive.rb +24 -0
- data/examples/rspec.rb +57 -0
- data/examples/text.rb +37 -0
- data/lib/pdf/reader/content.rb +58 -18
- metadata +10 -3
data/CHANGELOG
CHANGED
@@ -1,3 +1,8 @@
|
|
1
|
+
v0.7.7 (11th September 2009)
|
2
|
+
- Trigger callbacks contained in Form XObjects when we encounter them in a
|
3
|
+
content stream
|
4
|
+
- Fix inheritance of page resources to comply with section 3.6.2
|
5
|
+
|
1
6
|
v0.7.6 (28th August 2009)
|
2
7
|
- Various bug fixes that increase the files we can successfully parse
|
3
8
|
- Treat float and integer tokens differently (thanks Neil)
|
data/README.rdoc
CHANGED
@@ -88,177 +88,7 @@ http://groups.google.com/group/pdf-reader
|
|
88
88
|
= Examples
|
89
89
|
|
90
90
|
The easiest way to explain how this works in practice is to show some examples.
|
91
|
-
|
92
|
-
== Naïve Page Counter
|
93
|
-
|
94
|
-
A simple app to count the number of pages in a PDF File.
|
95
|
-
|
96
|
-
require 'rubygems'
|
97
|
-
require 'pdf/reader'
|
98
|
-
|
99
|
-
class PageReceiver
|
100
|
-
attr_accessor :counter
|
101
|
-
|
102
|
-
def initialize
|
103
|
-
@counter = 0
|
104
|
-
end
|
105
|
-
|
106
|
-
# Called when page parsing ends
|
107
|
-
def end_page
|
108
|
-
@counter += 1
|
109
|
-
end
|
110
|
-
end
|
111
|
-
|
112
|
-
receiver = PageReceiver.new
|
113
|
-
pdf = PDF::Reader.file("somefile.pdf", receiver)
|
114
|
-
puts "#{receiver.counter} pages"
|
115
|
-
|
116
|
-
== List all callbacks generated by a single PDF
|
117
|
-
|
118
|
-
WARNING: this will generate a *lot* of output, so you probably want to pipe
|
119
|
-
it through less or to a text file.
|
120
|
-
|
121
|
-
require 'rubygems'
|
122
|
-
require 'pdf/reader'
|
123
|
-
|
124
|
-
receiver = PDF::Reader::RegisterReceiver.new
|
125
|
-
pdf = PDF::Reader.file("somefile.pdf", receiver)
|
126
|
-
receiver.callbacks.each do |cb|
|
127
|
-
puts cb
|
128
|
-
end
|
129
|
-
|
130
|
-
== Extract all text from a single PDF
|
131
|
-
|
132
|
-
class PageTextReceiver
|
133
|
-
attr_accessor :content
|
134
|
-
|
135
|
-
def initialize
|
136
|
-
@content = []
|
137
|
-
end
|
138
|
-
|
139
|
-
# Called when page parsing starts
|
140
|
-
def begin_page(arg = nil)
|
141
|
-
@content << ""
|
142
|
-
end
|
143
|
-
|
144
|
-
# record text that is drawn on the page
|
145
|
-
def show_text(string, *params)
|
146
|
-
@content.last << string.strip
|
147
|
-
end
|
148
|
-
|
149
|
-
# there's a few text callbacks, so make sure we process them all
|
150
|
-
alias :super_show_text :show_text
|
151
|
-
alias :move_to_next_line_and_show_text :show_text
|
152
|
-
alias :set_spacing_next_line_show_text :show_text
|
153
|
-
|
154
|
-
# this final text callback takes slightly different arguments
|
155
|
-
def show_text_with_positioning(*params)
|
156
|
-
params = params.first
|
157
|
-
params.each { |str| show_text(str) if str.kind_of?(String)}
|
158
|
-
end
|
159
|
-
end
|
160
|
-
|
161
|
-
receiver = PageTextReceiver.new
|
162
|
-
pdf = PDF::Reader.file("somefile.pdf", receiver)
|
163
|
-
puts receiver.content.inspect
|
164
|
-
|
165
|
-
== Extract metadata only
|
166
|
-
|
167
|
-
require 'rubygems'
|
168
|
-
require 'pdf/reader'
|
169
|
-
|
170
|
-
class MetaDataReceiver
|
171
|
-
attr_accessor :regular
|
172
|
-
attr_accessor :xml
|
173
|
-
|
174
|
-
def metadata(data)
|
175
|
-
@regular = data
|
176
|
-
end
|
177
|
-
|
178
|
-
def metadata_xml(data)
|
179
|
-
@xml = data
|
180
|
-
end
|
181
|
-
end
|
182
|
-
|
183
|
-
receiver = MetaDataReceiver.new
|
184
|
-
pdf = PDF::Reader.file(ARGV.shift, receiver, :pages => false, :metadata => true)
|
185
|
-
puts receiver.regular.inspect
|
186
|
-
puts receiver.xml.inspect
|
187
|
-
|
188
|
-
== Improved Page Counter
|
189
|
-
|
190
|
-
A simple app to display the number of pages in a PDF File.
|
191
|
-
|
192
|
-
require 'rubygems'
|
193
|
-
require 'pdf/reader'
|
194
|
-
|
195
|
-
class PageReceiver
|
196
|
-
attr_accessor :pages
|
197
|
-
|
198
|
-
# Called when page parsing ends
|
199
|
-
def page_count(arg)
|
200
|
-
@pages = arg
|
201
|
-
end
|
202
|
-
end
|
203
|
-
|
204
|
-
receiver = PageReceiver.new
|
205
|
-
pdf = PDF::Reader.file("somefile.pdf", receiver, :pages => false)
|
206
|
-
puts "#{receiver.pages} pages"
|
207
|
-
|
208
|
-
== Basic RSpec of a generated PDF
|
209
|
-
|
210
|
-
require 'rubygems'
|
211
|
-
require 'pdf/reader'
|
212
|
-
require 'pdf/writer'
|
213
|
-
require 'spec'
|
214
|
-
|
215
|
-
class PageTextReceiver
|
216
|
-
attr_accessor :content
|
217
|
-
|
218
|
-
def initialize
|
219
|
-
@content = []
|
220
|
-
end
|
221
|
-
|
222
|
-
# Called when page parsing starts
|
223
|
-
def begin_page(arg = nil)
|
224
|
-
@content << ""
|
225
|
-
end
|
226
|
-
|
227
|
-
def show_text(string, *params)
|
228
|
-
@content.last << string.strip
|
229
|
-
end
|
230
|
-
|
231
|
-
# there's a few text callbacks, so make sure we process them all
|
232
|
-
alias :super_show_text :show_text
|
233
|
-
alias :move_to_next_line_and_show_text :show_text
|
234
|
-
alias :set_spacing_next_line_show_text :show_text
|
235
|
-
|
236
|
-
def show_text_with_positioning(*params)
|
237
|
-
params = params.first
|
238
|
-
params.each { |str| show_text(str) if str.kind_of?(String)}
|
239
|
-
end
|
240
|
-
end
|
241
|
-
|
242
|
-
context "My generated PDF" do
|
243
|
-
specify "should have the correct text on 2 pages" do
|
244
|
-
|
245
|
-
# generate our PDF
|
246
|
-
pdf = PDF::Writer.new
|
247
|
-
pdf.text "Chunky", :font_size => 32, :justification => :center
|
248
|
-
pdf.start_new_page
|
249
|
-
pdf.text "Bacon", :font_size => 32, :justification => :center
|
250
|
-
pdf.save_as("chunkybacon.pdf")
|
251
|
-
|
252
|
-
# process the PDF
|
253
|
-
receiver = PageTextReceiver.new
|
254
|
-
PDF::Reader.file("chunkybacon.pdf", receiver)
|
255
|
-
|
256
|
-
# confirm the text appears on the correct pages
|
257
|
-
receiver.content.size.should eql(2)
|
258
|
-
receiver.content[0].should eql("Chunky")
|
259
|
-
receiver.content[1].should eql("Bacon")
|
260
|
-
end
|
261
|
-
end
|
91
|
+
Check out the examples/ directory for a few files.
|
262
92
|
|
263
93
|
= Known Limitations
|
264
94
|
|
data/Rakefile
CHANGED
data/TODO
CHANGED
@@ -16,6 +16,8 @@ v0.8
|
|
16
16
|
- detect when a font's encoding is a CMap (generally used for pre-Unicode, multibyte asian encodings), and display a user friendly error
|
17
17
|
- Improve interpretation of non content stream data (ie metadata). recognise dates, etc
|
18
18
|
- Support Cross Reference Streams (spec 3.4.7)
|
19
|
+
- Fix inheritance of page attributes. Resources has been done, but plenty of other attributes
|
20
|
+
are inheritable. See table 3.2.7 in the spec
|
19
21
|
|
20
22
|
v0.9
|
21
23
|
- Add a way to extract raster images
|
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# coding: utf-8
|
3
|
+
|
4
|
+
# List all callbacks generated by a single PDF
|
5
|
+
#
|
6
|
+
# WARNING: this will generate a *lot* of output, so you probably want to pipe
|
7
|
+
# it through less or to a text file.
|
8
|
+
|
9
|
+
require 'rubygems'
|
10
|
+
require 'pdf/reader'
|
11
|
+
|
12
|
+
receiver = PDF::Reader::RegisterReceiver.new
|
13
|
+
pdf = PDF::Reader.file("somefile.pdf", receiver)
|
14
|
+
receiver.callbacks.each do |cb|
|
15
|
+
puts cb
|
16
|
+
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# coding: utf-8
|
3
|
+
|
4
|
+
# A sample script that attempts to extract bates numbers from a PDF file.
|
5
|
+
# Bates numbers are often used to mark up documents being used in legal
|
6
|
+
# cases. For more info, see http://en.wikipedia.org/wiki/Bates_numbering
|
7
|
+
#
|
8
|
+
# Acrobat 9 introduced a markup syntax that directly specifies the bates
|
9
|
+
# number for each page. For earlier versions, the easiest way to find
|
10
|
+
# the number is to look for words that match a pattern.
|
11
|
+
#
|
12
|
+
# This example attempts to extract numbers using the Acrobat 9 syntax.
|
13
|
+
# As a fall back, you can provide a regular expression that will be
|
14
|
+
# used to look for words that look like the numbers you expect in the
|
15
|
+
# page content.
|
16
|
+
|
17
|
+
require 'rubygems'
|
18
|
+
require 'pdf/reader'
|
19
|
+
|
20
|
+
class BatesReceiver
|
21
|
+
|
22
|
+
def initialize(regexp = nil)
|
23
|
+
@numbers = []
|
24
|
+
@backup = []
|
25
|
+
@regexp = regexp
|
26
|
+
end
|
27
|
+
|
28
|
+
def numbers
|
29
|
+
@numbers.size > 0 ? @numbers : @backup
|
30
|
+
end
|
31
|
+
|
32
|
+
# Called when a marked content sequence begins
|
33
|
+
def begin_marked_content(*args)
|
34
|
+
return unless args.size >= 2
|
35
|
+
return unless args.first == :Artifact
|
36
|
+
return unless args[1][:Subtype] == :BatesN
|
37
|
+
|
38
|
+
@numbers << args[1][:Contents]
|
39
|
+
end
|
40
|
+
alias :begin_marked_content_with_pl :begin_marked_content
|
41
|
+
|
42
|
+
# record text that is drawn on the page
|
43
|
+
def show_text(string, *params)
|
44
|
+
return if @regexp.nil?
|
45
|
+
|
46
|
+
string.scan(@regexp).each { |m| @backup << m }
|
47
|
+
end
|
48
|
+
|
49
|
+
# there's a few text callbacks, so make sure we process them all
|
50
|
+
alias :super_show_text :show_text
|
51
|
+
alias :move_to_next_line_and_show_text :show_text
|
52
|
+
alias :set_spacing_next_line_show_text :show_text
|
53
|
+
|
54
|
+
# this final text callback takes slightly different arguments
|
55
|
+
def show_text_with_positioning(*params)
|
56
|
+
params = params.first
|
57
|
+
params.each { |str| show_text(str) if str.kind_of?(String)}
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
61
|
+
receiver = BatesReceiver.new(/CC.+/)
|
62
|
+
PDF::Reader.file("bates.pdf", receiver)
|
63
|
+
puts receiver.numbers.inspect
|
@@ -0,0 +1,25 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# coding: utf-8
|
4
|
+
# Extract metadata only
|
5
|
+
|
6
|
+
require 'rubygems'
|
7
|
+
require 'pdf/reader'
|
8
|
+
|
9
|
+
class MetaDataReceiver
|
10
|
+
attr_accessor :regular
|
11
|
+
attr_accessor :xml
|
12
|
+
|
13
|
+
def metadata(data)
|
14
|
+
@regular = data
|
15
|
+
end
|
16
|
+
|
17
|
+
def metadata_xml(data)
|
18
|
+
@xml = data
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
receiver = MetaDataReceiver.new
|
23
|
+
pdf = PDF::Reader.file(ARGV.shift, receiver, :pages => false, :metadata => true)
|
24
|
+
puts receiver.regular.inspect
|
25
|
+
puts receiver.xml.inspect
|
@@ -0,0 +1,23 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# coding: utf-8
|
3
|
+
|
4
|
+
# Improved Page Counter
|
5
|
+
#
|
6
|
+
# A simple app to display the number of pages in a PDF File.
|
7
|
+
#
|
8
|
+
|
9
|
+
require 'rubygems'
|
10
|
+
require 'pdf/reader'
|
11
|
+
|
12
|
+
class PageReceiver
|
13
|
+
attr_accessor :pages
|
14
|
+
|
15
|
+
# Called with the number of pages in the document
|
16
|
+
def page_count(arg)
|
17
|
+
@pages = arg
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
receiver = PageReceiver.new
|
22
|
+
pdf = PDF::Reader.file("somefile.pdf", receiver, :pages => false)
|
23
|
+
puts "#{receiver.pages} pages"
|
@@ -0,0 +1,24 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# coding: utf-8
|
3
|
+
|
4
|
+
# A simple app to count the number of pages in a PDF File.
|
5
|
+
|
6
|
+
require 'rubygems'
|
7
|
+
require 'pdf/reader'
|
8
|
+
|
9
|
+
class PageReceiver
|
10
|
+
attr_accessor :counter
|
11
|
+
|
12
|
+
def initialize
|
13
|
+
@counter = 0
|
14
|
+
end
|
15
|
+
|
16
|
+
# Called when page parsing ends
|
17
|
+
def end_page
|
18
|
+
@counter += 1
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
receiver = PageReceiver.new
|
23
|
+
pdf = PDF::Reader.file("somefile.pdf", receiver)
|
24
|
+
puts "#{receiver.counter} pages"
|
data/examples/rspec.rb
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# coding: utf-8
|
3
|
+
|
4
|
+
# Basic RSpec of a generated PDF
|
5
|
+
|
6
|
+
require 'rubygems'
|
7
|
+
require 'pdf/reader'
|
8
|
+
require 'pdf/writer'
|
9
|
+
require 'spec'
|
10
|
+
|
11
|
+
class PageTextReceiver
|
12
|
+
attr_accessor :content
|
13
|
+
|
14
|
+
def initialize
|
15
|
+
@content = []
|
16
|
+
end
|
17
|
+
|
18
|
+
# Called when page parsing starts
|
19
|
+
def begin_page(arg = nil)
|
20
|
+
@content << ""
|
21
|
+
end
|
22
|
+
|
23
|
+
def show_text(string, *params)
|
24
|
+
@content.last << string.strip
|
25
|
+
end
|
26
|
+
|
27
|
+
# there's a few text callbacks, so make sure we process them all
|
28
|
+
alias :super_show_text :show_text
|
29
|
+
alias :move_to_next_line_and_show_text :show_text
|
30
|
+
alias :set_spacing_next_line_show_text :show_text
|
31
|
+
|
32
|
+
def show_text_with_positioning(*params)
|
33
|
+
params = params.first
|
34
|
+
params.each { |str| show_text(str) if str.kind_of?(String)}
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
context "My generated PDF" do
|
39
|
+
specify "should have the correct text on 2 pages" do
|
40
|
+
|
41
|
+
# generate our PDF
|
42
|
+
pdf = PDF::Writer.new
|
43
|
+
pdf.text "Chunky", :font_size => 32, :justification => :center
|
44
|
+
pdf.start_new_page
|
45
|
+
pdf.text "Bacon", :font_size => 32, :justification => :center
|
46
|
+
pdf.save_as("chunkybacon.pdf")
|
47
|
+
|
48
|
+
# process the PDF
|
49
|
+
receiver = PageTextReceiver.new
|
50
|
+
PDF::Reader.file("chunkybacon.pdf", receiver)
|
51
|
+
|
52
|
+
# confirm the text appears on the correct pages
|
53
|
+
receiver.content.size.should eql(2)
|
54
|
+
receiver.content[0].should eql("Chunky")
|
55
|
+
receiver.content[1].should eql("Bacon")
|
56
|
+
end
|
57
|
+
end
|
data/examples/text.rb
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# coding: utf-8
|
3
|
+
|
4
|
+
# Extract all text from a single PDF
|
5
|
+
|
6
|
+
class PageTextReceiver
|
7
|
+
attr_accessor :content
|
8
|
+
|
9
|
+
def initialize
|
10
|
+
@content = []
|
11
|
+
end
|
12
|
+
|
13
|
+
# Called when page parsing starts
|
14
|
+
def begin_page(arg = nil)
|
15
|
+
@content << ""
|
16
|
+
end
|
17
|
+
|
18
|
+
# record text that is drawn on the page
|
19
|
+
def show_text(string, *params)
|
20
|
+
@content.last << string.strip
|
21
|
+
end
|
22
|
+
|
23
|
+
# there's a few text callbacks, so make sure we process them all
|
24
|
+
alias :super_show_text :show_text
|
25
|
+
alias :move_to_next_line_and_show_text :show_text
|
26
|
+
alias :set_spacing_next_line_show_text :show_text
|
27
|
+
|
28
|
+
# this final text callback takes slightly different arguments
|
29
|
+
def show_text_with_positioning(*params)
|
30
|
+
params = params.first
|
31
|
+
params.each { |str| show_text(str) if str.kind_of?(String)}
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
receiver = PageTextReceiver.new
|
36
|
+
pdf = PDF::Reader.file("somefile.pdf", receiver)
|
37
|
+
puts receiver.content.inspect
|
data/lib/pdf/reader/content.rb
CHANGED
@@ -147,18 +147,14 @@ class PDF::Reader
|
|
147
147
|
# - metadata
|
148
148
|
# - xml_metadata
|
149
149
|
# - page_count
|
150
|
+
# - begin_form_xobject
|
151
|
+
# - end_form_xobject
|
150
152
|
#
|
151
153
|
# == Resource Callbacks
|
152
154
|
#
|
153
|
-
# Each page
|
155
|
+
# Each page can contain (or inherit) a range of resources required for the page,
|
154
156
|
# including things like fonts and images. The following callbacks may appear
|
155
|
-
# after
|
156
|
-
# on a page:
|
157
|
-
#
|
158
|
-
# In most cases, these callbacks associate a name with each resource, allowing it
|
159
|
-
# to be referred to by name in the page content. For example, an XObject can hold an image.
|
160
|
-
# If it gets mapped to the name "IM1", then it can be placed on the page using
|
161
|
-
# invoke_xobject "IM1".
|
157
|
+
# after begin_page if the relevant resources exist on a page:
|
162
158
|
#
|
163
159
|
# - resource_procset
|
164
160
|
# - resource_xobject
|
@@ -166,6 +162,12 @@ class PDF::Reader
|
|
166
162
|
# - resource_colorspace
|
167
163
|
# - resource_pattern
|
168
164
|
# - resource_font
|
165
|
+
#
|
166
|
+
# In most cases, these callbacks associate a name with each resource, allowing it
|
167
|
+
# to be referred to by name in the page content. For example, an XObject can hold an image.
|
168
|
+
# If it gets mapped to the name "IM1", then it can be placed on the page using
|
169
|
+
# invoke_xobject "IM1".
|
170
|
+
#
|
169
171
|
class Content
|
170
172
|
OPERATORS = {
|
171
173
|
'b' => :close_fill_stroke,
|
@@ -284,22 +286,19 @@ class PDF::Reader
|
|
284
286
|
# its content
|
285
287
|
def walk_pages (page)
|
286
288
|
|
287
|
-
if page[:Resources]
|
288
|
-
res = page[:Resources]
|
289
|
-
page.delete(:Resources)
|
290
|
-
end
|
291
|
-
|
292
289
|
# extract page content
|
293
290
|
if page[:Type] == :Pages
|
294
291
|
callback(:begin_page_container, [page])
|
295
|
-
|
292
|
+
res = @xref.object(page[:Resources])
|
293
|
+
resources.push res if res
|
296
294
|
@xref.object(page[:Kids]).each {|child| walk_pages(@xref.object(child))}
|
295
|
+
resources.pop if res
|
297
296
|
callback(:end_page_container)
|
298
297
|
elsif page[:Type] == :Page
|
299
298
|
callback(:begin_page, [page])
|
300
|
-
|
301
|
-
|
302
|
-
|
299
|
+
res = @xref.object(page[:Resources])
|
300
|
+
resources.push res if res
|
301
|
+
walk_resources(current_resources)
|
303
302
|
|
304
303
|
if @xref.object(page[:Contents]).kind_of?(Array)
|
305
304
|
contents = @xref.object(page[:Contents])
|
@@ -312,10 +311,38 @@ class PDF::Reader
|
|
312
311
|
content_stream(obj)
|
313
312
|
end if page.has_key?(:Contents) and page[:Contents]
|
314
313
|
|
314
|
+
resources.pop if res
|
315
315
|
callback(:end_page)
|
316
316
|
end
|
317
317
|
end
|
318
318
|
################################################################################
|
319
|
+
# Retrieve the XObject for the supplied label and if it's a Form, walk it
|
320
|
+
# like a regular page content stream.
|
321
|
+
#
|
322
|
+
def walk_xobject_form(label)
|
323
|
+
xobjects = current_resources[:XObject] || {}
|
324
|
+
xobject = @xref.object(xobjects[label])
|
325
|
+
|
326
|
+
if xobject && xobject.hash[:Subtype] == :Form
|
327
|
+
callback(:begin_form_xobject)
|
328
|
+
resources = @xref.object(xobject.hash[:Resources])
|
329
|
+
walk_resources(resources) if resources
|
330
|
+
content_stream(xobject.to_s)
|
331
|
+
callback(:end_form_xobject)
|
332
|
+
end
|
333
|
+
end
|
334
|
+
|
335
|
+
################################################################################
|
336
|
+
# Return a merged hash of all resources that are current (Pages, page and xobject); later entries override earlier ones
|
337
|
+
#
|
338
|
+
def current_resources
|
339
|
+
hash = {}
|
340
|
+
resources.each do |res|
|
341
|
+
hash.merge!(res)
|
342
|
+
end
|
343
|
+
hash
|
344
|
+
end
|
345
|
+
################################################################################
|
319
346
|
# Reads a PDF content stream and calls all the appropriate callback methods for the operators
|
320
347
|
# it contains
|
321
348
|
def content_stream (instructions)
|
@@ -341,8 +368,16 @@ class PDF::Reader
|
|
341
368
|
# read the raw image data from the buffer without tokenising
|
342
369
|
@params << @buffer.read_until("EI")
|
343
370
|
end
|
371
|
+
|
344
372
|
callback(OPERATORS[token], @params)
|
345
|
-
|
373
|
+
|
374
|
+
if OPERATORS[token] == :invoke_xobject
|
375
|
+
xobject_label = @params.first
|
376
|
+
@params.clear
|
377
|
+
walk_xobject_form(xobject_label)
|
378
|
+
else
|
379
|
+
@params.clear
|
380
|
+
end
|
346
381
|
else
|
347
382
|
@params << token
|
348
383
|
end
|
@@ -352,6 +387,8 @@ class PDF::Reader
|
|
352
387
|
end
|
353
388
|
################################################################################
|
354
389
|
def walk_resources(resources)
|
390
|
+
return unless resources.respond_to?(:[])
|
391
|
+
|
355
392
|
resources = resolve_references(resources)
|
356
393
|
|
357
394
|
# extract any procset information
|
@@ -446,6 +483,9 @@ class PDF::Reader
|
|
446
483
|
obj
|
447
484
|
end
|
448
485
|
end
|
486
|
+
def resources
|
487
|
+
@resources ||= []
|
488
|
+
end
|
449
489
|
end
|
450
490
|
################################################################################
|
451
491
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdf-reader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.6
|
4
|
+
version: 0.7.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Peter Jones
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-
|
12
|
+
date: 2009-09-11 00:00:00 +10:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -36,6 +36,13 @@ extra_rdoc_files:
|
|
36
36
|
- CHANGELOG
|
37
37
|
- MIT-LICENSE
|
38
38
|
files:
|
39
|
+
- examples/extract_bates.rb
|
40
|
+
- examples/text.rb
|
41
|
+
- examples/page_counter_naive.rb
|
42
|
+
- examples/callbacks.rb
|
43
|
+
- examples/metadata.rb
|
44
|
+
- examples/page_counter_improved.rb
|
45
|
+
- examples/rspec.rb
|
39
46
|
- lib/pdf/reader.rb
|
40
47
|
- lib/pdf/reader/buffer.rb
|
41
48
|
- lib/pdf/reader/cmap.rb
|
@@ -94,7 +101,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
94
101
|
requirements: []
|
95
102
|
|
96
103
|
rubyforge_project: pdf-reader
|
97
|
-
rubygems_version: 1.3.
|
104
|
+
rubygems_version: 1.3.5
|
98
105
|
signing_key:
|
99
106
|
specification_version: 3
|
100
107
|
summary: A library for accessing the content of PDF files
|