pdf-reader 0.7.6 → 0.7.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG CHANGED
@@ -1,3 +1,8 @@
1
+ v0.7.7 (11th September 2009)
2
+ - Trigger callbacks contained in Form XObjects when we encounter them in a
3
+ content stream
4
+ - Fix inheritance of page resources to comply with section 3.6.2
5
+
1
6
  v0.7.6 (28th August 2009)
2
7
  - Various bug fixes that increase the files we can successfully parse
3
8
  - Treat float and integer tokens differently (thanks Neil)
@@ -88,177 +88,7 @@ http://groups.google.com/group/pdf-reader
88
88
  = Examples
89
89
 
90
90
  The easiest way to explain how this works in practice is to show some examples.
91
-
92
- == Naïve Page Counter
93
-
94
- A simple app to count the number of pages in a PDF File.
95
-
96
- require 'rubygems'
97
- require 'pdf/reader'
98
-
99
- class PageReceiver
100
- attr_accessor :counter
101
-
102
- def initialize
103
- @counter = 0
104
- end
105
-
106
- # Called when page parsing ends
107
- def end_page
108
- @counter += 1
109
- end
110
- end
111
-
112
- receiver = PageReceiver.new
113
- pdf = PDF::Reader.file("somefile.pdf", receiver)
114
- puts "#{receiver.counter} pages"
115
-
116
- == List all callbacks generated by a single PDF
117
-
118
- WARNING: this will generate a *lot* of output, so you probably want to pipe
119
- it through less or to a text file.
120
-
121
- require 'rubygems'
122
- require 'pdf/reader'
123
-
124
- receiver = PDF::Reader::RegisterReceiver.new
125
- pdf = PDF::Reader.file("somefile.pdf", receiver)
126
- receiver.callbacks.each do |cb|
127
- puts cb
128
- end
129
-
130
- == Extract all text from a single PDF
131
-
132
- class PageTextReceiver
133
- attr_accessor :content
134
-
135
- def initialize
136
- @content = []
137
- end
138
-
139
- # Called when page parsing starts
140
- def begin_page(arg = nil)
141
- @content << ""
142
- end
143
-
144
- # record text that is drawn on the page
145
- def show_text(string, *params)
146
- @content.last << string.strip
147
- end
148
-
149
- # there's a few text callbacks, so make sure we process them all
150
- alias :super_show_text :show_text
151
- alias :move_to_next_line_and_show_text :show_text
152
- alias :set_spacing_next_line_show_text :show_text
153
-
154
- # this final text callback takes slightly different arguments
155
- def show_text_with_positioning(*params)
156
- params = params.first
157
- params.each { |str| show_text(str) if str.kind_of?(String)}
158
- end
159
- end
160
-
161
- receiver = PageTextReceiver.new
162
- pdf = PDF::Reader.file("somefile.pdf", receiver)
163
- puts receiver.content.inspect
164
-
165
- == Extract metadata only
166
-
167
- require 'rubygems'
168
- require 'pdf/reader'
169
-
170
- class MetaDataReceiver
171
- attr_accessor :regular
172
- attr_accessor :xml
173
-
174
- def metadata(data)
175
- @regular = data
176
- end
177
-
178
- def metadata_xml(data)
179
- @xml = data
180
- end
181
- end
182
-
183
- receiver = MetaDataReceiver.new
184
- pdf = PDF::Reader.file(ARGV.shift, receiver, :pages => false, :metadata => true)
185
- puts receiver.regular.inspect
186
- puts receiver.xml.inspect
187
-
188
- == Improved Page Counter
189
-
190
- A simple app to display the number of pages in a PDF File.
191
-
192
- require 'rubygems'
193
- require 'pdf/reader'
194
-
195
- class PageReceiver
196
- attr_accessor :pages
197
-
198
- # Called when page parsing ends
199
- def page_count(arg)
200
- @pages = arg
201
- end
202
- end
203
-
204
- receiver = PageReceiver.new
205
- pdf = PDF::Reader.file("somefile.pdf", receiver, :pages => false)
206
- puts "#{receiver.pages} pages"
207
-
208
- == Basic RSpec of a generated PDF
209
-
210
- require 'rubygems'
211
- require 'pdf/reader'
212
- require 'pdf/writer'
213
- require 'spec'
214
-
215
- class PageTextReceiver
216
- attr_accessor :content
217
-
218
- def initialize
219
- @content = []
220
- end
221
-
222
- # Called when page parsing starts
223
- def begin_page(arg = nil)
224
- @content << ""
225
- end
226
-
227
- def show_text(string, *params)
228
- @content.last << string.strip
229
- end
230
-
231
- # there's a few text callbacks, so make sure we process them all
232
- alias :super_show_text :show_text
233
- alias :move_to_next_line_and_show_text :show_text
234
- alias :set_spacing_next_line_show_text :show_text
235
-
236
- def show_text_with_positioning(*params)
237
- params = params.first
238
- params.each { |str| show_text(str) if str.kind_of?(String)}
239
- end
240
- end
241
-
242
- context "My generated PDF" do
243
- specify "should have the correct text on 2 pages" do
244
-
245
- # generate our PDF
246
- pdf = PDF::Writer.new
247
- pdf.text "Chunky", :font_size => 32, :justification => :center
248
- pdf.start_new_page
249
- pdf.text "Bacon", :font_size => 32, :justification => :center
250
- pdf.save_as("chunkybacon.pdf")
251
-
252
- # process the PDF
253
- receiver = PageTextReceiver.new
254
- PDF::Reader.file("chunkybacon.pdf", receiver)
255
-
256
- # confirm the text appears on the correct pages
257
- receiver.content.size.should eql(2)
258
- receiver.content[0].should eql("Chunky")
259
- receiver.content[1].should eql("Bacon")
260
- end
261
- end
91
+ Check out the examples/ directory for a few files.
262
92
 
263
93
  = Known Limitations
264
94
 
data/Rakefile CHANGED
@@ -6,7 +6,7 @@ require 'rake/testtask'
6
6
  require "rake/gempackagetask"
7
7
  require 'spec/rake/spectask'
8
8
 
9
- PKG_VERSION = "0.7.6"
9
+ PKG_VERSION = "0.7.7"
10
10
  PKG_NAME = "pdf-reader"
11
11
  PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"
12
12
 
data/TODO CHANGED
@@ -16,6 +16,8 @@ v0.8
16
16
  - detect when a font's encoding is a CMap (generally used for pre-Unicode, multibyte asian encodings), and display a user friendly error
17
17
  - Improve interpretation of non content stream data (ie metadata). recognise dates, etc
18
18
  - Support Cross Reference Streams (spec 3.4.7)
19
+ - Fix inheritance of page attributes. Resources are now handled, but plenty of other attributes
20
+ are inheritable. See table 3.27 in the spec
19
21
 
20
22
  v0.9
21
23
  - Add a way to extract raster images
@@ -0,0 +1,16 @@
1
+ #!/usr/bin/env ruby
2
+ # coding: utf-8
3
+
4
+ # List all callbacks generated by a single PDF
5
+ #
6
+ # WARNING: this will generate a *lot* of output, so you probably want to pipe
7
+ # it through less or to a text file.
8
+
9
+ require 'rubygems'
10
+ require 'pdf/reader'
11
+
12
+ receiver = PDF::Reader::RegisterReceiver.new
13
+ pdf = PDF::Reader.file("somefile.pdf", receiver)
14
+ receiver.callbacks.each do |cb|
15
+ puts cb
16
+ end
@@ -0,0 +1,63 @@
1
+ #!/usr/bin/env ruby
2
+ # coding: utf-8
3
+
4
+ # A sample script that attempts to extract bates numbers from a PDF file.
5
+ # Bates numbers are often used to mark up documents being used in legal
6
+ # cases. For more info, see http://en.wikipedia.org/wiki/Bates_numbering
7
+ #
8
+ # Acrobat 9 introduced a markup syntax that directly specifies the bates
9
+ # number for each page. For earlier versions, the easiest way to find
10
+ # the number is to look for words that match a pattern.
11
+ #
12
+ # This example attempts to extract numbers using the Acrobat 9 syntax.
13
+ # As a fall back, you can provide a regular expression that will be
14
+ # used to look for words that look like the numbers you expect in the
15
+ # page content.
16
+
17
+ require 'rubygems'
18
+ require 'pdf/reader'
19
+
20
+ class BatesReceiver
21
+
22
+ def initialize(regexp = nil)
23
+ @numbers = []
24
+ @backup = []
25
+ @regexp = regexp
26
+ end
27
+
28
+ def numbers
29
+ @numbers.size > 0 ? @numbers : @backup
30
+ end
31
+
32
+ # Called when marked content begins; records the :Contents of Artifact/BatesN entries
33
+ def begin_marked_content(*args)
34
+ return unless args.size >= 2
35
+ return unless args.first == :Artifact
36
+ return unless args[1][:Subtype] == :BatesN
37
+
38
+ @numbers << args[1][:Contents]
39
+ end
40
+ alias :begin_marked_content_with_pl :begin_marked_content
41
+
42
+ # record text that is drawn on the page
43
+ def show_text(string, *params)
44
+ return if @regexp.nil?
45
+
46
+ string.scan(@regexp).each { |m| @backup << m }
47
+ end
48
+
49
+ # there's a few text callbacks, so make sure we process them all
50
+ alias :super_show_text :show_text
51
+ alias :move_to_next_line_and_show_text :show_text
52
+ alias :set_spacing_next_line_show_text :show_text
53
+
54
+ # this final text callback takes slightly different arguments
55
+ def show_text_with_positioning(*params)
56
+ params = params.first
57
+ params.each { |str| show_text(str) if str.kind_of?(String)}
58
+ end
59
+ end
60
+
61
+ receiver = BatesReceiver.new(/CC.+/)
62
+ PDF::Reader.file("bates.pdf", receiver)
63
+ puts receiver.numbers.inspect
@@ -0,0 +1,25 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # coding: utf-8
4
+ # Extract metadata only
5
+
6
+ require 'rubygems'
7
+ require 'pdf/reader'
8
+
9
+ class MetaDataReceiver
10
+ attr_accessor :regular
11
+ attr_accessor :xml
12
+
13
+ def metadata(data)
14
+ @regular = data
15
+ end
16
+
17
+ def metadata_xml(data)
18
+ @xml = data
19
+ end
20
+ end
21
+
22
+ receiver = MetaDataReceiver.new
23
+ pdf = PDF::Reader.file(ARGV.shift, receiver, :pages => false, :metadata => true)
24
+ puts receiver.regular.inspect
25
+ puts receiver.xml.inspect
@@ -0,0 +1,23 @@
1
+ #!/usr/bin/env ruby
2
+ # coding: utf-8
3
+
4
+ # Improved Page Counter
5
+ #
6
+ # A simple app to display the number of pages in a PDF File.
7
+ #
8
+
9
+ require 'rubygems'
10
+ require 'pdf/reader'
11
+
12
+ class PageReceiver
13
+ attr_accessor :pages
14
+
15
+ # Called with the number of pages in the document
16
+ def page_count(arg)
17
+ @pages = arg
18
+ end
19
+ end
20
+
21
+ receiver = PageReceiver.new
22
+ pdf = PDF::Reader.file("somefile.pdf", receiver, :pages => false)
23
+ puts "#{receiver.pages} pages"
@@ -0,0 +1,24 @@
1
+ #!/usr/bin/env ruby
2
+ # coding: utf-8
3
+
4
+ # A simple app to count the number of pages in a PDF File.
5
+
6
+ require 'rubygems'
7
+ require 'pdf/reader'
8
+
9
+ class PageReceiver
10
+ attr_accessor :counter
11
+
12
+ def initialize
13
+ @counter = 0
14
+ end
15
+
16
+ # Called when page parsing ends
17
+ def end_page
18
+ @counter += 1
19
+ end
20
+ end
21
+
22
+ receiver = PageReceiver.new
23
+ pdf = PDF::Reader.file("somefile.pdf", receiver)
24
+ puts "#{receiver.counter} pages"
@@ -0,0 +1,57 @@
1
+ #!/usr/bin/env ruby
2
+ # coding: utf-8
3
+
4
+ # Basic RSpec of a generated PDF
5
+
6
+ require 'rubygems'
7
+ require 'pdf/reader'
8
+ require 'pdf/writer'
9
+ require 'spec'
10
+
11
+ class PageTextReceiver
12
+ attr_accessor :content
13
+
14
+ def initialize
15
+ @content = []
16
+ end
17
+
18
+ # Called when page parsing starts
19
+ def begin_page(arg = nil)
20
+ @content << ""
21
+ end
22
+
23
+ def show_text(string, *params)
24
+ @content.last << string.strip
25
+ end
26
+
27
+ # there's a few text callbacks, so make sure we process them all
28
+ alias :super_show_text :show_text
29
+ alias :move_to_next_line_and_show_text :show_text
30
+ alias :set_spacing_next_line_show_text :show_text
31
+
32
+ def show_text_with_positioning(*params)
33
+ params = params.first
34
+ params.each { |str| show_text(str) if str.kind_of?(String)}
35
+ end
36
+ end
37
+
38
+ context "My generated PDF" do
39
+ specify "should have the correct text on 2 pages" do
40
+
41
+ # generate our PDF
42
+ pdf = PDF::Writer.new
43
+ pdf.text "Chunky", :font_size => 32, :justification => :center
44
+ pdf.start_new_page
45
+ pdf.text "Bacon", :font_size => 32, :justification => :center
46
+ pdf.save_as("chunkybacon.pdf")
47
+
48
+ # process the PDF
49
+ receiver = PageTextReceiver.new
50
+ PDF::Reader.file("chunkybacon.pdf", receiver)
51
+
52
+ # confirm the text appears on the correct pages
53
+ receiver.content.size.should eql(2)
54
+ receiver.content[0].should eql("Chunky")
55
+ receiver.content[1].should eql("Bacon")
56
+ end
57
+ end
@@ -0,0 +1,37 @@
1
+ #!/usr/bin/env ruby
2
+ # coding: utf-8
3
+
4
+ # Extract all text from a single PDF
5
+
6
+ class PageTextReceiver
7
+ attr_accessor :content
8
+
9
+ def initialize
10
+ @content = []
11
+ end
12
+
13
+ # Called when page parsing starts
14
+ def begin_page(arg = nil)
15
+ @content << ""
16
+ end
17
+
18
+ # record text that is drawn on the page
19
+ def show_text(string, *params)
20
+ @content.last << string.strip
21
+ end
22
+
23
+ # there's a few text callbacks, so make sure we process them all
24
+ alias :super_show_text :show_text
25
+ alias :move_to_next_line_and_show_text :show_text
26
+ alias :set_spacing_next_line_show_text :show_text
27
+
28
+ # this final text callback takes slightly different arguments
29
+ def show_text_with_positioning(*params)
30
+ params = params.first
31
+ params.each { |str| show_text(str) if str.kind_of?(String)}
32
+ end
33
+ end
34
+
35
+ receiver = PageTextReceiver.new
36
+ pdf = PDF::Reader.file("somefile.pdf", receiver)
37
+ puts receiver.content.inspect
@@ -147,18 +147,14 @@ class PDF::Reader
147
147
  # - metadata
148
148
  # - metadata_xml
149
149
  # - page_count
150
+ # - begin_form_xobject
151
+ # - end_form_xobject
150
152
  #
151
153
  # == Resource Callbacks
152
154
  #
153
- # Each page and page_container can contain a range of resources required for the page,
155
+ # Each page can contain (or inherit) a range of resources required for the page,
154
156
  # including things like fonts and images. The following callbacks may appear
155
- # after begin_page_container and begin_page if the relevant resources exist
156
- # on a page:
157
- #
158
- # In most cases, these callbacks associate a name with each resource, allowing it
159
- # to be referred to by name in the page content. For example, an XObject can hold an image.
160
- # If it gets mapped to the name "IM1", then it can be placed on the page using
161
- # invoke_xobject "IM1".
157
+ # after begin_page if the relevant resources exist on a page:
162
158
  #
163
159
  # - resource_procset
164
160
  # - resource_xobject
@@ -166,6 +162,12 @@ class PDF::Reader
166
162
  # - resource_colorspace
167
163
  # - resource_pattern
168
164
  # - resource_font
165
+ #
166
+ # In most cases, these callbacks associate a name with each resource, allowing it
167
+ # to be referred to by name in the page content. For example, an XObject can hold an image.
168
+ # If it gets mapped to the name "IM1", then it can be placed on the page using
169
+ # invoke_xobject "IM1".
170
+ #
169
171
  class Content
170
172
  OPERATORS = {
171
173
  'b' => :close_fill_stroke,
@@ -284,22 +286,19 @@ class PDF::Reader
284
286
  # its content
285
287
  def walk_pages (page)
286
288
 
287
- if page[:Resources]
288
- res = page[:Resources]
289
- page.delete(:Resources)
290
- end
291
-
292
289
  # extract page content
293
290
  if page[:Type] == :Pages
294
291
  callback(:begin_page_container, [page])
295
- walk_resources(@xref.object(res)) if res
292
+ res = @xref.object(page[:Resources])
293
+ resources.push res if res
296
294
  @xref.object(page[:Kids]).each {|child| walk_pages(@xref.object(child))}
295
+ resources.pop if res
297
296
  callback(:end_page_container)
298
297
  elsif page[:Type] == :Page
299
298
  callback(:begin_page, [page])
300
- walk_resources(@xref.object(res)) if res
301
- @page = page
302
- @params = []
299
+ res = @xref.object(page[:Resources])
300
+ resources.push res if res
301
+ walk_resources(current_resources)
303
302
 
304
303
  if @xref.object(page[:Contents]).kind_of?(Array)
305
304
  contents = @xref.object(page[:Contents])
@@ -312,10 +311,38 @@ class PDF::Reader
312
311
  content_stream(obj)
313
312
  end if page.has_key?(:Contents) and page[:Contents]
314
313
 
314
+ resources.pop if res
315
315
  callback(:end_page)
316
316
  end
317
317
  end
318
318
  ################################################################################
319
+ # Retrieve the XObject for the supplied label and if it's a Form, walk it
320
+ # like a regular page content stream.
321
+ #
322
+ def walk_xobject_form(label)
323
+ xobjects = current_resources[:XObject] || {}
324
+ xobject = @xref.object(xobjects[label])
325
+
326
+ if xobject && xobject.hash[:Subtype] == :Form
327
+ callback(:begin_form_xobject)
328
+ resources = @xref.object(xobject.hash[:Resources])
329
+ walk_resources(resources) if resources
330
+ content_stream(xobject.to_s)
331
+ callback(:end_form_xobject)
332
+ end
333
+ end
334
+
335
+ ################################################################################
336
+ # Return a merged hash of all resource dictionaries currently on the resources stack (later entries win)
337
+ #
338
+ def current_resources
339
+ hash = {}
340
+ resources.each do |res|
341
+ hash.merge!(res)
342
+ end
343
+ hash
344
+ end
345
+ ################################################################################
319
346
  # Reads a PDF content stream and calls all the appropriate callback methods for the operators
320
347
  # it contains
321
348
  def content_stream (instructions)
@@ -341,8 +368,16 @@ class PDF::Reader
341
368
  # read the raw image data from the buffer without tokenising
342
369
  @params << @buffer.read_until("EI")
343
370
  end
371
+
344
372
  callback(OPERATORS[token], @params)
345
- @params.clear
373
+
374
+ if OPERATORS[token] == :invoke_xobject
375
+ xobject_label = @params.first
376
+ @params.clear
377
+ walk_xobject_form(xobject_label)
378
+ else
379
+ @params.clear
380
+ end
346
381
  else
347
382
  @params << token
348
383
  end
@@ -352,6 +387,8 @@ class PDF::Reader
352
387
  end
353
388
  ################################################################################
354
389
  def walk_resources(resources)
390
+ return unless resources.respond_to?(:[])
391
+
355
392
  resources = resolve_references(resources)
356
393
 
357
394
  # extract any procset information
@@ -446,6 +483,9 @@ class PDF::Reader
446
483
  obj
447
484
  end
448
485
  end
486
+ def resources
487
+ @resources ||= []
488
+ end
449
489
  end
450
490
  ################################################################################
451
491
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdf-reader
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.6
4
+ version: 0.7.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Peter Jones
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-08-28 00:00:00 +10:00
12
+ date: 2009-09-11 00:00:00 +10:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -36,6 +36,13 @@ extra_rdoc_files:
36
36
  - CHANGELOG
37
37
  - MIT-LICENSE
38
38
  files:
39
+ - examples/extract_bates.rb
40
+ - examples/text.rb
41
+ - examples/page_counter_naive.rb
42
+ - examples/callbacks.rb
43
+ - examples/metadata.rb
44
+ - examples/page_counter_improved.rb
45
+ - examples/rspec.rb
39
46
  - lib/pdf/reader.rb
40
47
  - lib/pdf/reader/buffer.rb
41
48
  - lib/pdf/reader/cmap.rb
@@ -94,7 +101,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
94
101
  requirements: []
95
102
 
96
103
  rubyforge_project: pdf-reader
97
- rubygems_version: 1.3.4
104
+ rubygems_version: 1.3.5
98
105
  signing_key:
99
106
  specification_version: 3
100
107
  summary: A library for accessing the content of PDF files