pdf-reader 0.7.1 → 0.7.2

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG CHANGED
@@ -1,3 +1,10 @@
1
+ v0.7.2 (20th May 2008)
2
+ - Throw an UnsupportedFeatureError if we try to open an encrypted/secure PDF
3
+ - Correctly handle page content instruction sets with trailing whitespace
4
+ - Represent PDF Streams with a new object, PDF::Reader::Stream
5
+ - there really wasn't any point in separating the stream content from its associated dict. You need both
6
+ parts to correctly interpret the content
7
+
1
8
  v0.7.1 (6th May 2008)
2
9
  - Non-page strings (ie. metadata, etc) are now converted to UTF-8 more accurately
3
10
  - Fixed a regression between 0.6.2 and 0.7 that prevented difference tables from being applied
data/README CHANGED
@@ -48,6 +48,9 @@ UnsupportedFeatureError - The PDF uses a feature that PDF::Reader doesn't curren
48
48
  support. Again, we welcome submissions of PDF files that exhibit these features to help
49
49
  us with future code improvements.
50
50
 
51
+ Any other exceptions should be considered bugs and should be reported (unless they originate
52
+ inside your receiver, in which case you're on your own)
53
+
51
54
  = Maintainers
52
55
 
53
56
  - Peter Jones <mailto:pjones@pmade.com>
@@ -229,9 +232,9 @@ layout of the file, not the order objects are displayed to the user. As a
229
232
  consequence of this it is highly unlikely that text will be completely in
230
233
  order.
231
234
 
232
- Occasionally some text cannot be extracted properly due to the way it has been stored, or the use
233
- of invalid bytes. In these cases PDF::Reader will output a little UTF-8 friendly box to indicate
234
- an unrecognisable character.
235
+ Occasionally some text cannot be extracted properly due to the way it has been
236
+ stored, or the use of invalid bytes. In these cases PDF::Reader will output a
237
+ little UTF-8 friendly box to indicate an unrecognisable character.
235
238
 
236
239
  = Resources
237
240
 
data/Rakefile CHANGED
@@ -6,7 +6,7 @@ require 'rake/testtask'
6
6
  require "rake/gempackagetask"
7
7
  require 'spec/rake/spectask'
8
8
 
9
- PKG_VERSION = "0.7.1"
9
+ PKG_VERSION = "0.7.2"
10
10
  PKG_NAME = "pdf-reader"
11
11
  PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"
12
12
 
@@ -65,6 +65,9 @@ spec = Gem::Specification.new do |spec|
65
65
  ["Rakefile"]
66
66
 
67
67
  spec.require_path = "lib"
68
+ spec.bindir = "bin"
69
+ spec.executables << "pdf_text"
70
+ spec.executables << "pdf_list_callbacks"
68
71
  spec.has_rdoc = true
69
72
  spec.extra_rdoc_files = %w{README TODO CHANGELOG}
70
73
  spec.rdoc_options << '--title' << 'PDF::Reader Documentation' <<
@@ -0,0 +1,17 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ $LOAD_PATH.unshift(File.dirname(__FILE__) + "/../lib")
4
+
5
+ require 'pdf/reader'
6
+
7
+ receiver = PDF::Reader::RegisterReceiver.new
8
+
9
+ if ARGV.empty?
10
+ PDF::Reader.new.parse($stdin, receiver)
11
+ else
12
+ PDF::Reader.file(ARGV[0], receiver)
13
+ end
14
+
15
+ receiver.callbacks.each do |callback|
16
+ puts "#{callback[:name]} - #{callback[:args].inspect}"
17
+ end
@@ -0,0 +1,40 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ $LOAD_PATH.unshift(File.dirname(__FILE__) + "/../lib")
4
+
5
+ require 'pdf/reader'
6
+
7
+ class PageTextReceiver
8
+ attr_accessor :content
9
+
10
+ # Called when page parsing ends; prints any text collected so far
11
+ def end_page(arg = nil)
12
+ if @content
13
+ puts @content
14
+ puts
15
+ end
16
+ end
17
+
18
+ def show_text(string, *params)
19
+ @content = "" if @content.nil?
20
+ @content << string
21
+ end
22
+
23
+ # there are a few text callbacks, so make sure we process them all
24
+ alias :super_show_text :show_text
25
+ alias :move_to_next_line_and_show_text :show_text
26
+ alias :set_spacing_next_line_show_text :show_text
27
+
28
+ def show_text_with_positioning(*params)
29
+ params = params.first
30
+ params.each { |str| show_text(str) if str.kind_of?(String)}
31
+ end
32
+ end
33
+
34
+ receiver = PageTextReceiver.new
35
+
36
+ if ARGV.empty?
37
+ PDF::Reader.new.parse($stdin, receiver)
38
+ else
39
+ PDF::Reader.file(ARGV[0], receiver)
40
+ end
@@ -98,6 +98,7 @@ require 'pdf/reader/font'
98
98
  require 'pdf/reader/parser'
99
99
  require 'pdf/reader/reference'
100
100
  require 'pdf/reader/register_receiver'
101
+ require 'pdf/reader/stream'
101
102
  require 'pdf/reader/text_receiver'
102
103
  require 'pdf/reader/token'
103
104
  require 'pdf/reader/xref'
@@ -119,8 +120,9 @@ class PDF::Reader
119
120
  options.merge!(opts)
120
121
 
121
122
  trailer = @xref.load
122
- @content.metadata(@xref.object(trailer[:Info]).first) if options[:metadata]
123
- @content.document(@xref.object(trailer[:Root]).first) if options[:pages]
123
+ raise PDF::Reader::UnsupportedFeatureError, 'PDF::Reader cannot read encrypted PDF files' if trailer[:Encrypt]
124
+ @content.metadata(@xref.object(trailer[:Info])) if options[:metadata]
125
+ @content.document(@xref.object(trailer[:Root])) if options[:pages]
124
126
  self
125
127
  end
126
128
  ################################################################################
@@ -77,6 +77,7 @@ class PDF::Reader
77
77
  # returns true if the underlying IO object is at end and the internal buffer
78
78
  # is empty
79
79
  def eof?
80
+ ready_token
80
81
  if @buffer
81
82
  @buffer.empty? && @io.eof?
82
83
  else
@@ -91,7 +92,7 @@ class PDF::Reader
91
92
  # PDF files are processed by tokenising the content into a series of objects and commands.
92
93
  # This prepares the buffer for use by reading the next line of tokens into memory.
93
94
  def ready_token (with_strip=true, skip_blanks=true)
94
- while @buffer.nil? or @buffer.empty?
95
+ while (@buffer.nil? or @buffer.empty?) && !@io.eof?
95
96
  @buffer = @io.readline
96
97
  @buffer.force_encoding("BINARY") if @buffer.respond_to?(:force_encoding)
97
98
  #@buffer.sub!(/%.*$/, '') if strip_comments
@@ -260,11 +260,10 @@ class PDF::Reader
260
260
  # Begin processing the document
261
261
  def document (root)
262
262
  if root[:Metadata]
263
- obj, stream = @xref.object(root[:Metadata])
264
- callback(:xml_metadata,stream)
263
+ callback(:xml_metadata,@xref.object(root[:Metadata]))
265
264
  end
266
265
  callback(:begin_document, [root])
267
- walk_pages(@xref.object(root[:Pages]).first)
266
+ walk_pages(@xref.object(root[:Pages]))
268
267
  callback(:end_document)
269
268
  end
270
269
  ################################################################################
@@ -280,26 +279,24 @@ class PDF::Reader
280
279
  # extract page content
281
280
  if page[:Type] == :Pages
282
281
  callback(:begin_page_container, [page])
283
- walk_resources(@xref.object(res).first) if res
284
- page[:Kids].each {|child| walk_pages(@xref.object(child).first)}
282
+ walk_resources(@xref.object(res)) if res
283
+ page[:Kids].each {|child| walk_pages(@xref.object(child))}
285
284
  callback(:end_page_container)
286
285
  elsif page[:Type] == :Page
287
286
  callback(:begin_page, [page])
288
- walk_resources(@xref.object(res).first) if res
287
+ walk_resources(@xref.object(res)) if res
289
288
  @page = page
290
289
  @params = []
291
290
 
292
- if page[:Contents].kind_of?(Array)
293
- contents = page[:Contents]
294
- elsif @xref.obj_type(page[:Contents]) == :Array
295
- contents, stream = @xref.object(page[:Contents])
291
+ if @xref.object(page[:Contents]).kind_of?(Array)
292
+ contents = @xref.object(page[:Contents])
296
293
  else
297
294
  contents = [page[:Contents]]
298
295
  end
299
296
 
300
297
  contents.each do |content|
301
- obj, stream = @xref.object(content)
302
- content_stream(stream)
298
+ obj = @xref.object(content)
299
+ content_stream(obj)
303
300
  end if page.has_key?(:Contents) and page[:Contents]
304
301
 
305
302
  callback(:end_page)
@@ -356,42 +353,41 @@ class PDF::Reader
356
353
 
357
354
  # extract any xobject information
358
355
  if resources[:XObject]
359
- @xref.object(resources[:XObject]).first.each do |name, val|
360
- obj, stream = @xref.object(val)
361
- callback(:resource_xobject, [name, obj, stream])
356
+ @xref.object(resources[:XObject]).each do |name, val|
357
+ callback(:resource_xobject, [name, @xref.object(val)])
362
358
  end
363
359
  end
364
360
 
365
361
  # extract any extgstate information
366
362
  if resources[:ExtGState]
367
- @xref.object(resources[:ExtGState]).first.each do |name, val|
368
- callback(:resource_extgstate, [name, @xref.object(val).first])
363
+ @xref.object(resources[:ExtGState]).each do |name, val|
364
+ callback(:resource_extgstate, [name, @xref.object(val)])
369
365
  end
370
366
  end
371
367
 
372
368
  # extract any colorspace information
373
369
  if resources[:ColorSpace]
374
- @xref.object(resources[:ColorSpace]).first.each do |name, val|
375
- callback(:resource_colorspace, [name, @xref.object(val).first])
370
+ @xref.object(resources[:ColorSpace]).each do |name, val|
371
+ callback(:resource_colorspace, [name, @xref.object(val)])
376
372
  end
377
373
  end
378
374
 
379
375
  # extract any pattern information
380
376
  if resources[:Pattern]
381
- @xref.object(resources[:Pattern]).first.each do |name, val|
382
- callback(:resource_pattern, [name, @xref.object(val).first])
377
+ @xref.object(resources[:Pattern]).each do |name, val|
378
+ callback(:resource_pattern, [name, @xref.object(val)])
383
379
  end
384
380
  end
385
381
 
386
382
  # extract any font information
387
383
  if resources[:Font]
388
- @xref.object(resources[:Font]).first.each do |label, desc|
389
- desc = @xref.object(desc).first
384
+ @xref.object(resources[:Font]).each do |label, desc|
385
+ desc = @xref.object(desc)
390
386
  @fonts[label] = PDF::Reader::Font.new
391
387
  @fonts[label].label = label
392
388
  @fonts[label].subtype = desc[:Subtype] if desc[:Subtype]
393
389
  @fonts[label].basefont = desc[:BaseFont] if desc[:BaseFont]
394
- @fonts[label].encoding = PDF::Reader::Encoding.factory(@xref.object(desc[:Encoding]).first)
390
+ @fonts[label].encoding = PDF::Reader::Encoding.factory(@xref.object(desc[:Encoding]))
395
391
  @fonts[label].descendantfonts = desc[:DescendantFonts] if desc[:DescendantFonts]
396
392
  if desc[:ToUnicode]
397
393
  # this stream is a cmap
@@ -409,13 +405,11 @@ class PDF::Reader
409
405
  # Convert any PDF::Reader::Reference objects into a real object
410
406
  def resolve_references(obj)
411
407
  case obj
408
+ when PDF::Reader::Stream then
409
+ obj.hash = resolve_references(obj.hash)
410
+ obj
412
411
  when PDF::Reader::Reference then
413
- obj, stream = @xref.object(obj)
414
- if stream
415
- stream
416
- else
417
- resolve_references(obj)
418
- end
412
+ resolve_references(@xref.object(obj))
419
413
  when Hash then obj.each { |key,val| obj[key] = resolve_references(val) }
420
414
  when Array then obj.collect { |item| resolve_references(item) }
421
415
  else
@@ -174,8 +174,8 @@ class PDF::Reader
174
174
  obj = parse_token
175
175
  post_obj = parse_token
176
176
  case post_obj
177
- when "endobj" then return [obj,nil]
178
- when "stream" then return [obj, stream(obj)]
177
+ when "endobj" then return obj
178
+ when "stream" then return stream(obj)
179
179
  else raise MalformedPDFError, "PDF malformed, unexpected token #{post_obj}"
180
180
  end
181
181
  end
@@ -183,7 +183,7 @@ class PDF::Reader
183
183
  # Decodes the contents of a PDF Stream and returns it as a PDF::Reader::Stream (a String subclass that also carries the stream's dict).
184
184
  def stream (dict)
185
185
  raise MalformedPDFError, "PDF malformed, missing stream length" unless dict.has_key?(:Length)
186
- data = @buffer.read(@xref.object(dict[:Length]).first)
186
+ data = @buffer.read(@xref.object(dict[:Length]))
187
187
 
188
188
  Error.str_assert(parse_token, "endstream")
189
189
  Error.str_assert(parse_token, "endobj")
@@ -200,7 +200,7 @@ class PDF::Reader
200
200
  end
201
201
  end
202
202
 
203
- data
203
+ PDF::Reader::Stream.new(dict, data)
204
204
  end
205
205
  ################################################################################
206
206
  end
@@ -0,0 +1,43 @@
1
+ ################################################################################
2
+ #
3
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining
6
+ # a copy of this software and associated documentation files (the
7
+ # "Software"), to deal in the Software without restriction, including
8
+ # without limitation the rights to use, copy, modify, merge, publish,
9
+ # distribute, sublicense, and/or sell copies of the Software, and to
10
+ # permit persons to whom the Software is furnished to do so, subject to
11
+ # the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be
14
+ # included in all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
+ #
24
+ ################################################################################
25
+
26
+ class PDF::Reader
27
+ ################################################################################
28
+ # An internal PDF::Reader class that represents a single stream from a PDF file, together with its associated dict.
29
+ #
30
+ # Behaves exactly like a Ruby String - it basically exists for convenience.
31
+ class Stream < String
32
+ attr_accessor :hash
33
+ ################################################################################
34
+ # Creates a new stream with the specified dict (hash) and content (val)
35
+ def initialize (hash, val)
36
+ @hash = hash
37
+ super val
38
+ end
39
+ ################################################################################
40
+ end
41
+ ################################################################################
42
+ end
43
+ ################################################################################
@@ -61,15 +61,11 @@ class PDF::Reader
61
61
  #
62
62
  # If the object is a stream, it is returned as a PDF::Reader::Stream
63
63
  def object (ref, save_pos = true)
64
- return ref, nil unless ref.kind_of?(Reference)
64
+ return ref unless ref.kind_of?(Reference)
65
65
  pos = @buffer.pos if save_pos
66
- obj, stream = Parser.new(@buffer.seek(offset_for(ref)), self).object(ref.id, ref.gen)
66
+ obj = Parser.new(@buffer.seek(offset_for(ref)), self).object(ref.id, ref.gen)
67
67
  @buffer.seek(pos) if save_pos
68
- if stream
69
- return [obj, stream]
70
- else
71
- return [obj, nil]
72
- end
68
+ return obj
73
69
  end
74
70
  ################################################################################
75
71
  # Assumes the underlying buffer is positioned at the start of an Xref table and
@@ -112,7 +108,7 @@ class PDF::Reader
112
108
  end
113
109
  # returns the type of object a ref points to
114
110
  def obj_type(ref)
115
- obj, stream = object(ref)
111
+ obj = object(ref)
116
112
  obj.class.to_s.to_sym
117
113
  end
118
114
  # returns true if the supplied reference points to an object with a stream
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdf-reader
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.1
4
+ version: 0.7.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Peter Jones
@@ -9,14 +9,15 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2008-05-06 00:00:00 +10:00
12
+ date: 2008-05-20 00:00:00 +10:00
13
13
  default_executable:
14
14
  dependencies: []
15
15
 
16
16
  description: The PDF::Reader library implements a PDF parser conforming as much as possible to the PDF specification from Adobe
17
17
  email: pjones@pmade.com
18
- executables: []
19
-
18
+ executables:
19
+ - pdf_text
20
+ - pdf_list_callbacks
20
21
  extensions: []
21
22
 
22
23
  extra_rdoc_files:
@@ -41,6 +42,7 @@ files:
41
42
  - lib/pdf/reader/register_receiver.rb
42
43
  - lib/pdf/reader/font.rb
43
44
  - lib/pdf/reader/glyphlist.txt
45
+ - lib/pdf/reader/stream.rb
44
46
  - lib/pdf/reader/parser.rb.rej
45
47
  - lib/pdf/reader.rb
46
48
  - Rakefile
@@ -73,7 +75,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
73
75
  requirements: []
74
76
 
75
77
  rubyforge_project: pdf-reader
76
- rubygems_version: 1.0.1
78
+ rubygems_version: 1.1.1
77
79
  signing_key:
78
80
  specification_version: 2
79
81
  summary: A library for accessing the content of PDF files