pdf-reader 0.7.1 → 0.7.2
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +7 -0
- data/README +6 -3
- data/Rakefile +4 -1
- data/bin/pdf_list_callbacks +17 -0
- data/bin/pdf_text +40 -0
- data/lib/pdf/reader.rb +4 -2
- data/lib/pdf/reader/buffer.rb +2 -1
- data/lib/pdf/reader/content.rb +24 -30
- data/lib/pdf/reader/parser.rb +4 -4
- data/lib/pdf/reader/stream.rb +43 -0
- data/lib/pdf/reader/xref.rb +4 -8
- metadata +7 -5
data/CHANGELOG
CHANGED
@@ -1,3 +1,10 @@
|
|
1
|
+
v0.7.2 (20th May 2008)
|
2
|
+
- Throw an UnsupportedFeatureError if we try to open an encrypted/secure PDF
|
3
|
+
- Correctly handle page content instruction sets with trailing whitespace
|
4
|
+
- Represent PDF Streams with a new object, PDF::Reader::Stream
|
5
|
+
- there really wasn't any point in separating the stream content from its associated dict. You need both
|
6
|
+
parts to correctly interpret the content
|
7
|
+
|
1
8
|
v0.7.1 (6th May 2008)
|
2
9
|
- Non-page strings (ie. metadata, etc) are now converted to UTF-8 more accurately
|
3
10
|
- Fixed a regression between 0.6.2 and 0.7 that prevented difference tables from being applied
|
data/README
CHANGED
@@ -48,6 +48,9 @@ UnsupportedFeatureError - The PDF uses a feature that PDF::Reader doesn't curren
|
|
48
48
|
support. Again, we welcome submissions of PDF files that exhibit these features to help
|
49
49
|
us with future code improvements.
|
50
50
|
|
51
|
+
Any other exceptions should be considered bugs and should be reported (unless they originate
|
52
|
+
inside your receiver, in which case you're on your own)
|
53
|
+
|
51
54
|
= Maintainers
|
52
55
|
|
53
56
|
- Peter Jones <mailto:pjones@pmade.com>
|
@@ -229,9 +232,9 @@ layout of the file, not the order objects are displayed to the user. As a
|
|
229
232
|
consequence of this it is highly unlikely that text will be completely in
|
230
233
|
order.
|
231
234
|
|
232
|
-
Occasionally some text cannot be extracted properly due to the way it has been
|
233
|
-
of invalid bytes. In these cases PDF::Reader will output a
|
234
|
-
an unrecognisable character.
|
235
|
+
Occasionally some text cannot be extracted properly due to the way it has been
|
236
|
+
stored, or the use of invalid bytes. In these cases PDF::Reader will output a
|
237
|
+
little UTF-8 friendly box to indicate an unrecognisable character.
|
235
238
|
|
236
239
|
= Resources
|
237
240
|
|
data/Rakefile
CHANGED
@@ -6,7 +6,7 @@ require 'rake/testtask'
|
|
6
6
|
require "rake/gempackagetask"
|
7
7
|
require 'spec/rake/spectask'
|
8
8
|
|
9
|
-
PKG_VERSION = "0.7.1"
|
9
|
+
PKG_VERSION = "0.7.2"
|
10
10
|
PKG_NAME = "pdf-reader"
|
11
11
|
PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"
|
12
12
|
|
@@ -65,6 +65,9 @@ spec = Gem::Specification.new do |spec|
|
|
65
65
|
["Rakefile"]
|
66
66
|
|
67
67
|
spec.require_path = "lib"
|
68
|
+
spec.bindir = "bin"
|
69
|
+
spec.executables << "pdf_text"
|
70
|
+
spec.executables << "pdf_list_callbacks"
|
68
71
|
spec.has_rdoc = true
|
69
72
|
spec.extra_rdoc_files = %w{README TODO CHANGELOG}
|
70
73
|
spec.rdoc_options << '--title' << 'PDF::Reader Documentation' <<
|
@@ -0,0 +1,17 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__) + "/../lib")
|
4
|
+
|
5
|
+
require 'pdf/reader'
|
6
|
+
|
7
|
+
receiver = PDF::Reader::RegisterReceiver.new
|
8
|
+
|
9
|
+
if ARGV.empty?
|
10
|
+
PDF::Reader.new.parse($stdin, receiver)
|
11
|
+
else
|
12
|
+
PDF::Reader.file(ARGV[0], receiver)
|
13
|
+
end
|
14
|
+
|
15
|
+
receiver.callbacks.each do |callback|
|
16
|
+
puts "#{callback[:name]} - #{callback[:args].inspect}"
|
17
|
+
end
|
data/bin/pdf_text
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__) + "/../lib")
|
4
|
+
|
5
|
+
require 'pdf/reader'
|
6
|
+
|
7
|
+
class PageTextReceiver
|
8
|
+
attr_accessor :content
|
9
|
+
|
10
|
+
# Called when page parsing ends
|
11
|
+
def end_page(arg = nil)
|
12
|
+
if @content
|
13
|
+
puts @content
|
14
|
+
puts
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
def show_text(string, *params)
|
19
|
+
@content = "" if @content.nil?
|
20
|
+
@content << string
|
21
|
+
end
|
22
|
+
|
23
|
+
# there are a few text callbacks, so make sure we process them all
|
24
|
+
alias :super_show_text :show_text
|
25
|
+
alias :move_to_next_line_and_show_text :show_text
|
26
|
+
alias :set_spacing_next_line_show_text :show_text
|
27
|
+
|
28
|
+
def show_text_with_positioning(*params)
|
29
|
+
params = params.first
|
30
|
+
params.each { |str| show_text(str) if str.kind_of?(String)}
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
receiver = PageTextReceiver.new
|
35
|
+
|
36
|
+
if ARGV.empty?
|
37
|
+
PDF::Reader.new.parse($stdin, receiver)
|
38
|
+
else
|
39
|
+
PDF::Reader.file(ARGV[0], receiver)
|
40
|
+
end
|
data/lib/pdf/reader.rb
CHANGED
@@ -98,6 +98,7 @@ require 'pdf/reader/font'
|
|
98
98
|
require 'pdf/reader/parser'
|
99
99
|
require 'pdf/reader/reference'
|
100
100
|
require 'pdf/reader/register_receiver'
|
101
|
+
require 'pdf/reader/stream'
|
101
102
|
require 'pdf/reader/text_receiver'
|
102
103
|
require 'pdf/reader/token'
|
103
104
|
require 'pdf/reader/xref'
|
@@ -119,8 +120,9 @@ class PDF::Reader
|
|
119
120
|
options.merge!(opts)
|
120
121
|
|
121
122
|
trailer = @xref.load
|
122
|
-
|
123
|
-
@content.
|
123
|
+
raise PDF::Reader::UnsupportedFeatureError, 'PDF::Reader cannot read encrypted PDF files' if trailer[:Encrypt]
|
124
|
+
@content.metadata(@xref.object(trailer[:Info])) if options[:metadata]
|
125
|
+
@content.document(@xref.object(trailer[:Root])) if options[:pages]
|
124
126
|
self
|
125
127
|
end
|
126
128
|
################################################################################
|
data/lib/pdf/reader/buffer.rb
CHANGED
@@ -77,6 +77,7 @@ class PDF::Reader
|
|
77
77
|
# returns true if the underlying IO object is at end and the internal buffer
|
78
78
|
# is empty
|
79
79
|
def eof?
|
80
|
+
ready_token
|
80
81
|
if @buffer
|
81
82
|
@buffer.empty? && @io.eof?
|
82
83
|
else
|
@@ -91,7 +92,7 @@ class PDF::Reader
|
|
91
92
|
# PDF files are processed by tokenising the content into a series of objects and commands.
|
92
93
|
# This prepares the buffer for use by reading the next line of tokens into memory.
|
93
94
|
def ready_token (with_strip=true, skip_blanks=true)
|
94
|
-
while @buffer.nil? or @buffer.empty?
|
95
|
+
while (@buffer.nil? or @buffer.empty?) && !@io.eof?
|
95
96
|
@buffer = @io.readline
|
96
97
|
@buffer.force_encoding("BINARY") if @buffer.respond_to?(:force_encoding)
|
97
98
|
#@buffer.sub!(/%.*$/, '') if strip_comments
|
data/lib/pdf/reader/content.rb
CHANGED
@@ -260,11 +260,10 @@ class PDF::Reader
|
|
260
260
|
# Begin processing the document
|
261
261
|
def document (root)
|
262
262
|
if root[:Metadata]
|
263
|
-
|
264
|
-
callback(:xml_metadata,stream)
|
263
|
+
callback(:xml_metadata,@xref.object(root[:Metadata]))
|
265
264
|
end
|
266
265
|
callback(:begin_document, [root])
|
267
|
-
walk_pages(@xref.object(root[:Pages])
|
266
|
+
walk_pages(@xref.object(root[:Pages]))
|
268
267
|
callback(:end_document)
|
269
268
|
end
|
270
269
|
################################################################################
|
@@ -280,26 +279,24 @@ class PDF::Reader
|
|
280
279
|
# extract page content
|
281
280
|
if page[:Type] == :Pages
|
282
281
|
callback(:begin_page_container, [page])
|
283
|
-
walk_resources(@xref.object(res)
|
284
|
-
page[:Kids].each {|child| walk_pages(@xref.object(child)
|
282
|
+
walk_resources(@xref.object(res)) if res
|
283
|
+
page[:Kids].each {|child| walk_pages(@xref.object(child))}
|
285
284
|
callback(:end_page_container)
|
286
285
|
elsif page[:Type] == :Page
|
287
286
|
callback(:begin_page, [page])
|
288
|
-
walk_resources(@xref.object(res)
|
287
|
+
walk_resources(@xref.object(res)) if res
|
289
288
|
@page = page
|
290
289
|
@params = []
|
291
290
|
|
292
|
-
if page[:Contents].kind_of?(Array)
|
293
|
-
contents = page[:Contents]
|
294
|
-
elsif @xref.obj_type(page[:Contents]) == :Array
|
295
|
-
contents, stream = @xref.object(page[:Contents])
|
291
|
+
if @xref.object(page[:Contents]).kind_of?(Array)
|
292
|
+
contents = @xref.object(page[:Contents])
|
296
293
|
else
|
297
294
|
contents = [page[:Contents]]
|
298
295
|
end
|
299
296
|
|
300
297
|
contents.each do |content|
|
301
|
-
obj
|
302
|
-
content_stream(
|
298
|
+
obj = @xref.object(content)
|
299
|
+
content_stream(obj)
|
303
300
|
end if page.has_key?(:Contents) and page[:Contents]
|
304
301
|
|
305
302
|
callback(:end_page)
|
@@ -356,42 +353,41 @@ class PDF::Reader
|
|
356
353
|
|
357
354
|
# extract any xobject information
|
358
355
|
if resources[:XObject]
|
359
|
-
@xref.object(resources[:XObject]).
|
360
|
-
|
361
|
-
callback(:resource_xobject, [name, obj, stream])
|
356
|
+
@xref.object(resources[:XObject]).each do |name, val|
|
357
|
+
callback(:resource_xobject, [name, @xref.object(val)])
|
362
358
|
end
|
363
359
|
end
|
364
360
|
|
365
361
|
# extract any extgstate information
|
366
362
|
if resources[:ExtGState]
|
367
|
-
@xref.object(resources[:ExtGState]).
|
368
|
-
callback(:resource_extgstate, [name, @xref.object(val)
|
363
|
+
@xref.object(resources[:ExtGState]).each do |name, val|
|
364
|
+
callback(:resource_extgstate, [name, @xref.object(val)])
|
369
365
|
end
|
370
366
|
end
|
371
367
|
|
372
368
|
# extract any colorspace information
|
373
369
|
if resources[:ColorSpace]
|
374
|
-
@xref.object(resources[:ColorSpace]).
|
375
|
-
callback(:resource_colorspace, [name, @xref.object(val)
|
370
|
+
@xref.object(resources[:ColorSpace]).each do |name, val|
|
371
|
+
callback(:resource_colorspace, [name, @xref.object(val)])
|
376
372
|
end
|
377
373
|
end
|
378
374
|
|
379
375
|
# extract any pattern information
|
380
376
|
if resources[:Pattern]
|
381
|
-
@xref.object(resources[:Pattern]).
|
382
|
-
callback(:resource_pattern, [name, @xref.object(val)
|
377
|
+
@xref.object(resources[:Pattern]).each do |name, val|
|
378
|
+
callback(:resource_pattern, [name, @xref.object(val)])
|
383
379
|
end
|
384
380
|
end
|
385
381
|
|
386
382
|
# extract any font information
|
387
383
|
if resources[:Font]
|
388
|
-
@xref.object(resources[:Font]).
|
389
|
-
desc = @xref.object(desc)
|
384
|
+
@xref.object(resources[:Font]).each do |label, desc|
|
385
|
+
desc = @xref.object(desc)
|
390
386
|
@fonts[label] = PDF::Reader::Font.new
|
391
387
|
@fonts[label].label = label
|
392
388
|
@fonts[label].subtype = desc[:Subtype] if desc[:Subtype]
|
393
389
|
@fonts[label].basefont = desc[:BaseFont] if desc[:BaseFont]
|
394
|
-
@fonts[label].encoding = PDF::Reader::Encoding.factory(@xref.object(desc[:Encoding])
|
390
|
+
@fonts[label].encoding = PDF::Reader::Encoding.factory(@xref.object(desc[:Encoding]))
|
395
391
|
@fonts[label].descendantfonts = desc[:DescendantFonts] if desc[:DescendantFonts]
|
396
392
|
if desc[:ToUnicode]
|
397
393
|
# this stream is a cmap
|
@@ -409,13 +405,11 @@ class PDF::Reader
|
|
409
405
|
# Convert any PDF::Reader::Resource objects into a real object
|
410
406
|
def resolve_references(obj)
|
411
407
|
case obj
|
408
|
+
when PDF::Reader::Stream then
|
409
|
+
obj.hash = resolve_references(obj.hash)
|
410
|
+
obj
|
412
411
|
when PDF::Reader::Reference then
|
413
|
-
|
414
|
-
if stream
|
415
|
-
stream
|
416
|
-
else
|
417
|
-
resolve_references(obj)
|
418
|
-
end
|
412
|
+
resolve_references(@xref.object(obj))
|
419
413
|
when Hash then obj.each { |key,val| obj[key] = resolve_references(val) }
|
420
414
|
when Array then obj.collect { |item| resolve_references(item) }
|
421
415
|
else
|
data/lib/pdf/reader/parser.rb
CHANGED
@@ -174,8 +174,8 @@ class PDF::Reader
|
|
174
174
|
obj = parse_token
|
175
175
|
post_obj = parse_token
|
176
176
|
case post_obj
|
177
|
-
when "endobj" then return
|
178
|
-
when "stream" then return
|
177
|
+
when "endobj" then return obj
|
178
|
+
when "stream" then return stream(obj)
|
179
179
|
else raise MalformedPDFError, "PDF malformed, unexpected token #{post_obj}"
|
180
180
|
end
|
181
181
|
end
|
@@ -183,7 +183,7 @@ class PDF::Reader
|
|
183
183
|
# Decodes the contents of a PDF Stream and returns it as a Ruby String.
|
184
184
|
def stream (dict)
|
185
185
|
raise MalformedPDFError, "PDF malformed, missing stream length" unless dict.has_key?(:Length)
|
186
|
-
data = @buffer.read(@xref.object(dict[:Length])
|
186
|
+
data = @buffer.read(@xref.object(dict[:Length]))
|
187
187
|
|
188
188
|
Error.str_assert(parse_token, "endstream")
|
189
189
|
Error.str_assert(parse_token, "endobj")
|
@@ -200,7 +200,7 @@ class PDF::Reader
|
|
200
200
|
end
|
201
201
|
end
|
202
202
|
|
203
|
-
data
|
203
|
+
PDF::Reader::Stream.new(dict, data)
|
204
204
|
end
|
205
205
|
################################################################################
|
206
206
|
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
################################################################################
|
2
|
+
#
|
3
|
+
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
|
4
|
+
#
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
# a copy of this software and associated documentation files (the
|
7
|
+
# "Software"), to deal in the Software without restriction, including
|
8
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
# the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be
|
14
|
+
# included in all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
23
|
+
#
|
24
|
+
################################################################################
|
25
|
+
|
26
|
+
class PDF::Reader
|
27
|
+
################################################################################
|
28
|
+
# An internal PDF::Reader class that represents a single stream from a PDF file.
|
29
|
+
#
|
30
|
+
# Behaves like a Ruby String holding the stream data, with the stream's dictionary available via #hash.
|
31
|
+
class Stream < String
|
32
|
+
attr_accessor :hash
|
33
|
+
################################################################################
|
34
|
+
# Creates a new stream with the specified dictionary (hash) and data (val)
|
35
|
+
def initialize (hash, val)
|
36
|
+
@hash = hash
|
37
|
+
super val
|
38
|
+
end
|
39
|
+
################################################################################
|
40
|
+
end
|
41
|
+
################################################################################
|
42
|
+
end
|
43
|
+
################################################################################
|
data/lib/pdf/reader/xref.rb
CHANGED
@@ -61,15 +61,11 @@ class PDF::Reader
|
|
61
61
|
#
|
62
62
|
# If the object is a stream, that is returned as well
|
63
63
|
def object (ref, save_pos = true)
|
64
|
-
return ref
|
64
|
+
return ref unless ref.kind_of?(Reference)
|
65
65
|
pos = @buffer.pos if save_pos
|
66
|
-
obj
|
66
|
+
obj = Parser.new(@buffer.seek(offset_for(ref)), self).object(ref.id, ref.gen)
|
67
67
|
@buffer.seek(pos) if save_pos
|
68
|
-
|
69
|
-
return [obj, stream]
|
70
|
-
else
|
71
|
-
return [obj, nil]
|
72
|
-
end
|
68
|
+
return obj
|
73
69
|
end
|
74
70
|
################################################################################
|
75
71
|
# Assumes the underlying buffer is positioned at the start of an Xref table and
|
@@ -112,7 +108,7 @@ class PDF::Reader
|
|
112
108
|
end
|
113
109
|
# returns the type of object a ref points to
|
114
110
|
def obj_type(ref)
|
115
|
-
obj
|
111
|
+
obj = object(ref)
|
116
112
|
obj.class.to_s.to_sym
|
117
113
|
end
|
118
114
|
# returns true if the supplied reference points to an object with a stream
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdf-reader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.1
|
4
|
+
version: 0.7.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Peter Jones
|
@@ -9,14 +9,15 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2008-05-
|
12
|
+
date: 2008-05-20 00:00:00 +10:00
|
13
13
|
default_executable:
|
14
14
|
dependencies: []
|
15
15
|
|
16
16
|
description: The PDF::Reader library implements a PDF parser conforming as much as possible to the PDF specification from Adobe
|
17
17
|
email: pjones@pmade.com
|
18
|
-
executables:
|
19
|
-
|
18
|
+
executables:
|
19
|
+
- pdf_text
|
20
|
+
- pdf_list_callbacks
|
20
21
|
extensions: []
|
21
22
|
|
22
23
|
extra_rdoc_files:
|
@@ -41,6 +42,7 @@ files:
|
|
41
42
|
- lib/pdf/reader/register_receiver.rb
|
42
43
|
- lib/pdf/reader/font.rb
|
43
44
|
- lib/pdf/reader/glyphlist.txt
|
45
|
+
- lib/pdf/reader/stream.rb
|
44
46
|
- lib/pdf/reader/parser.rb.rej
|
45
47
|
- lib/pdf/reader.rb
|
46
48
|
- Rakefile
|
@@ -73,7 +75,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
73
75
|
requirements: []
|
74
76
|
|
75
77
|
rubyforge_project: pdf-reader
|
76
|
-
rubygems_version: 1.
|
78
|
+
rubygems_version: 1.1.1
|
77
79
|
signing_key:
|
78
80
|
specification_version: 2
|
79
81
|
summary: A library for accessing the content of PDF files
|