pdf-reader 0.7.1 → 0.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +7 -0
- data/README +6 -3
- data/Rakefile +4 -1
- data/bin/pdf_list_callbacks +17 -0
- data/bin/pdf_text +40 -0
- data/lib/pdf/reader.rb +4 -2
- data/lib/pdf/reader/buffer.rb +2 -1
- data/lib/pdf/reader/content.rb +24 -30
- data/lib/pdf/reader/parser.rb +4 -4
- data/lib/pdf/reader/stream.rb +43 -0
- data/lib/pdf/reader/xref.rb +4 -8
- metadata +7 -5
data/CHANGELOG
CHANGED
@@ -1,3 +1,10 @@
|
|
1
|
+
v0.7.2 (20th May 2008)
|
2
|
+
- Throw an UnsupportedFeatureError if we try to open an encrypted/secure PDF
|
3
|
+
- Correctly handle page content instruction sets with trailing whitespace
|
4
|
+
- Represent PDF Streams with a new object, PDF::Reader::Stream
|
5
|
+
- there really wasn't any point in separating the stream content from its associated dict. You need both
|
6
|
+
parts to correctly interpret the content
|
7
|
+
|
1
8
|
v0.7.1 (6th May 2008)
|
2
9
|
- Non-page strings (ie. metadata, etc) are now converted to UTF-8 more accurately
|
3
10
|
- Fixed a regression between 0.6.2 and 0.7 that prevented difference tables from being applied
|
data/README
CHANGED
@@ -48,6 +48,9 @@ UnsupportedFeatureError - The PDF uses a feature that PDF::Reader doesn't curren
|
|
48
48
|
support. Again, we welcome submissions of PDF files that exhibit these features to help
|
49
49
|
us with future code improvements.
|
50
50
|
|
51
|
+
Any other exceptions should be considered bugs and should be reported (unless they originate
|
52
|
+
inside your receiver, in which case you're on your own)
|
53
|
+
|
51
54
|
= Maintainers
|
52
55
|
|
53
56
|
- Peter Jones <mailto:pjones@pmade.com>
|
@@ -229,9 +232,9 @@ layout of the file, not the order objects are displayed to the user. As a
|
|
229
232
|
consequence of this it is highly unlikely that text will be completely in
|
230
233
|
order.
|
231
234
|
|
232
|
-
Occasionally some text cannot be extracted properly due to the way it has been
|
233
|
-
of invalid bytes. In these cases PDF::Reader will output a
|
234
|
-
an unrecognisable character.
|
235
|
+
Occasionally some text cannot be extracted properly due to the way it has been
|
236
|
+
stored, or the use of invalid bytes. In these cases PDF::Reader will output a
|
237
|
+
little UTF-8 friendly box to indicate an unrecognisable character.
|
235
238
|
|
236
239
|
= Resources
|
237
240
|
|
data/Rakefile
CHANGED
@@ -6,7 +6,7 @@ require 'rake/testtask'
|
|
6
6
|
require "rake/gempackagetask"
|
7
7
|
require 'spec/rake/spectask'
|
8
8
|
|
9
|
-
PKG_VERSION = "0.7.1"
|
9
|
+
PKG_VERSION = "0.7.2"
|
10
10
|
PKG_NAME = "pdf-reader"
|
11
11
|
PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"
|
12
12
|
|
@@ -65,6 +65,9 @@ spec = Gem::Specification.new do |spec|
|
|
65
65
|
["Rakefile"]
|
66
66
|
|
67
67
|
spec.require_path = "lib"
|
68
|
+
spec.bindir = "bin"
|
69
|
+
spec.executables << "pdf_text"
|
70
|
+
spec.executables << "pdf_list_callbacks"
|
68
71
|
spec.has_rdoc = true
|
69
72
|
spec.extra_rdoc_files = %w{README TODO CHANGELOG}
|
70
73
|
spec.rdoc_options << '--title' << 'PDF::Reader Documentation' <<
|
data/bin/pdf_list_callbacks
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__) + "/../lib")
|
4
|
+
|
5
|
+
require 'pdf/reader'
|
6
|
+
|
7
|
+
receiver = PDF::Reader::RegisterReceiver.new
|
8
|
+
|
9
|
+
if ARGV.empty?
|
10
|
+
PDF::Reader.new.parse($stdin, receiver)
|
11
|
+
else
|
12
|
+
PDF::Reader.file(ARGV[0], receiver)
|
13
|
+
end
|
14
|
+
|
15
|
+
receiver.callbacks.each do |callback|
|
16
|
+
puts "#{callback[:name]} - #{callback[:args].inspect}"
|
17
|
+
end
|
data/bin/pdf_text
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__) + "/../lib")
|
4
|
+
|
5
|
+
require 'pdf/reader'
|
6
|
+
|
7
|
+
class PageTextReceiver
|
8
|
+
attr_accessor :content
|
9
|
+
|
10
|
+
# Called when page parsing ends; prints the text collected so far
|
11
|
+
def end_page(arg = nil)
|
12
|
+
if @content
|
13
|
+
puts @content
|
14
|
+
puts
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
def show_text(string, *params)
|
19
|
+
@content = "" if @content.nil?
|
20
|
+
@content << string
|
21
|
+
end
|
22
|
+
|
23
|
+
# there are a few text callbacks, so make sure we process them all
|
24
|
+
alias :super_show_text :show_text
|
25
|
+
alias :move_to_next_line_and_show_text :show_text
|
26
|
+
alias :set_spacing_next_line_show_text :show_text
|
27
|
+
|
28
|
+
def show_text_with_positioning(*params)
|
29
|
+
params = params.first
|
30
|
+
params.each { |str| show_text(str) if str.kind_of?(String)}
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
receiver = PageTextReceiver.new
|
35
|
+
|
36
|
+
if ARGV.empty?
|
37
|
+
PDF::Reader.new.parse($stdin, receiver)
|
38
|
+
else
|
39
|
+
PDF::Reader.file(ARGV[0], receiver)
|
40
|
+
end
|
data/lib/pdf/reader.rb
CHANGED
@@ -98,6 +98,7 @@ require 'pdf/reader/font'
|
|
98
98
|
require 'pdf/reader/parser'
|
99
99
|
require 'pdf/reader/reference'
|
100
100
|
require 'pdf/reader/register_receiver'
|
101
|
+
require 'pdf/reader/stream'
|
101
102
|
require 'pdf/reader/text_receiver'
|
102
103
|
require 'pdf/reader/token'
|
103
104
|
require 'pdf/reader/xref'
|
@@ -119,8 +120,9 @@ class PDF::Reader
|
|
119
120
|
options.merge!(opts)
|
120
121
|
|
121
122
|
trailer = @xref.load
|
122
|
-
|
123
|
-
@content.
|
123
|
+
raise PDF::Reader::UnsupportedFeatureError, 'PDF::Reader cannot read encrypted PDF files' if trailer[:Encrypt]
|
124
|
+
@content.metadata(@xref.object(trailer[:Info])) if options[:metadata]
|
125
|
+
@content.document(@xref.object(trailer[:Root])) if options[:pages]
|
124
126
|
self
|
125
127
|
end
|
126
128
|
################################################################################
|
data/lib/pdf/reader/buffer.rb
CHANGED
@@ -77,6 +77,7 @@ class PDF::Reader
|
|
77
77
|
# returns true if the underlying IO object is at end and the internal buffer
|
78
78
|
# is empty
|
79
79
|
def eof?
|
80
|
+
ready_token
|
80
81
|
if @buffer
|
81
82
|
@buffer.empty? && @io.eof?
|
82
83
|
else
|
@@ -91,7 +92,7 @@ class PDF::Reader
|
|
91
92
|
# PDF files are processed by tokenising the content into a series of objects and commands.
|
92
93
|
# This prepares the buffer for use by reading the next line of tokens into memory.
|
93
94
|
def ready_token (with_strip=true, skip_blanks=true)
|
94
|
-
while @buffer.nil? or @buffer.empty?
|
95
|
+
while (@buffer.nil? or @buffer.empty?) && !@io.eof?
|
95
96
|
@buffer = @io.readline
|
96
97
|
@buffer.force_encoding("BINARY") if @buffer.respond_to?(:force_encoding)
|
97
98
|
#@buffer.sub!(/%.*$/, '') if strip_comments
|
data/lib/pdf/reader/content.rb
CHANGED
@@ -260,11 +260,10 @@ class PDF::Reader
|
|
260
260
|
# Begin processing the document
|
261
261
|
def document (root)
|
262
262
|
if root[:Metadata]
|
263
|
-
|
264
|
-
callback(:xml_metadata,stream)
|
263
|
+
callback(:xml_metadata,@xref.object(root[:Metadata]))
|
265
264
|
end
|
266
265
|
callback(:begin_document, [root])
|
267
|
-
walk_pages(@xref.object(root[:Pages])
|
266
|
+
walk_pages(@xref.object(root[:Pages]))
|
268
267
|
callback(:end_document)
|
269
268
|
end
|
270
269
|
################################################################################
|
@@ -280,26 +279,24 @@ class PDF::Reader
|
|
280
279
|
# extract page content
|
281
280
|
if page[:Type] == :Pages
|
282
281
|
callback(:begin_page_container, [page])
|
283
|
-
walk_resources(@xref.object(res)
|
284
|
-
page[:Kids].each {|child| walk_pages(@xref.object(child)
|
282
|
+
walk_resources(@xref.object(res)) if res
|
283
|
+
page[:Kids].each {|child| walk_pages(@xref.object(child))}
|
285
284
|
callback(:end_page_container)
|
286
285
|
elsif page[:Type] == :Page
|
287
286
|
callback(:begin_page, [page])
|
288
|
-
walk_resources(@xref.object(res)
|
287
|
+
walk_resources(@xref.object(res)) if res
|
289
288
|
@page = page
|
290
289
|
@params = []
|
291
290
|
|
292
|
-
if page[:Contents].kind_of?(Array)
|
293
|
-
contents = page[:Contents]
|
294
|
-
elsif @xref.obj_type(page[:Contents]) == :Array
|
295
|
-
contents, stream = @xref.object(page[:Contents])
|
291
|
+
if @xref.object(page[:Contents]).kind_of?(Array)
|
292
|
+
contents = @xref.object(page[:Contents])
|
296
293
|
else
|
297
294
|
contents = [page[:Contents]]
|
298
295
|
end
|
299
296
|
|
300
297
|
contents.each do |content|
|
301
|
-
obj
|
302
|
-
content_stream(
|
298
|
+
obj = @xref.object(content)
|
299
|
+
content_stream(obj)
|
303
300
|
end if page.has_key?(:Contents) and page[:Contents]
|
304
301
|
|
305
302
|
callback(:end_page)
|
@@ -356,42 +353,41 @@ class PDF::Reader
|
|
356
353
|
|
357
354
|
# extract any xobject information
|
358
355
|
if resources[:XObject]
|
359
|
-
@xref.object(resources[:XObject]).
|
360
|
-
|
361
|
-
callback(:resource_xobject, [name, obj, stream])
|
356
|
+
@xref.object(resources[:XObject]).each do |name, val|
|
357
|
+
callback(:resource_xobject, [name, @xref.object(val)])
|
362
358
|
end
|
363
359
|
end
|
364
360
|
|
365
361
|
# extract any extgstate information
|
366
362
|
if resources[:ExtGState]
|
367
|
-
@xref.object(resources[:ExtGState]).
|
368
|
-
callback(:resource_extgstate, [name, @xref.object(val)
|
363
|
+
@xref.object(resources[:ExtGState]).each do |name, val|
|
364
|
+
callback(:resource_extgstate, [name, @xref.object(val)])
|
369
365
|
end
|
370
366
|
end
|
371
367
|
|
372
368
|
# extract any colorspace information
|
373
369
|
if resources[:ColorSpace]
|
374
|
-
@xref.object(resources[:ColorSpace]).
|
375
|
-
callback(:resource_colorspace, [name, @xref.object(val)
|
370
|
+
@xref.object(resources[:ColorSpace]).each do |name, val|
|
371
|
+
callback(:resource_colorspace, [name, @xref.object(val)])
|
376
372
|
end
|
377
373
|
end
|
378
374
|
|
379
375
|
# extract any pattern information
|
380
376
|
if resources[:Pattern]
|
381
|
-
@xref.object(resources[:Pattern]).
|
382
|
-
callback(:resource_pattern, [name, @xref.object(val)
|
377
|
+
@xref.object(resources[:Pattern]).each do |name, val|
|
378
|
+
callback(:resource_pattern, [name, @xref.object(val)])
|
383
379
|
end
|
384
380
|
end
|
385
381
|
|
386
382
|
# extract any font information
|
387
383
|
if resources[:Font]
|
388
|
-
@xref.object(resources[:Font]).
|
389
|
-
desc = @xref.object(desc)
|
384
|
+
@xref.object(resources[:Font]).each do |label, desc|
|
385
|
+
desc = @xref.object(desc)
|
390
386
|
@fonts[label] = PDF::Reader::Font.new
|
391
387
|
@fonts[label].label = label
|
392
388
|
@fonts[label].subtype = desc[:Subtype] if desc[:Subtype]
|
393
389
|
@fonts[label].basefont = desc[:BaseFont] if desc[:BaseFont]
|
394
|
-
@fonts[label].encoding = PDF::Reader::Encoding.factory(@xref.object(desc[:Encoding])
|
390
|
+
@fonts[label].encoding = PDF::Reader::Encoding.factory(@xref.object(desc[:Encoding]))
|
395
391
|
@fonts[label].descendantfonts = desc[:DescendantFonts] if desc[:DescendantFonts]
|
396
392
|
if desc[:ToUnicode]
|
397
393
|
# this stream is a cmap
|
@@ -409,13 +405,11 @@ class PDF::Reader
|
|
409
405
|
# Convert any PDF::Reader::Reference objects into a real object
|
410
406
|
def resolve_references(obj)
|
411
407
|
case obj
|
408
|
+
when PDF::Reader::Stream then
|
409
|
+
obj.hash = resolve_references(obj.hash)
|
410
|
+
obj
|
412
411
|
when PDF::Reader::Reference then
|
413
|
-
|
414
|
-
if stream
|
415
|
-
stream
|
416
|
-
else
|
417
|
-
resolve_references(obj)
|
418
|
-
end
|
412
|
+
resolve_references(@xref.object(obj))
|
419
413
|
when Hash then obj.each { |key,val| obj[key] = resolve_references(val) }
|
420
414
|
when Array then obj.collect { |item| resolve_references(item) }
|
421
415
|
else
|
data/lib/pdf/reader/parser.rb
CHANGED
@@ -174,8 +174,8 @@ class PDF::Reader
|
|
174
174
|
obj = parse_token
|
175
175
|
post_obj = parse_token
|
176
176
|
case post_obj
|
177
|
-
when "endobj" then return
|
178
|
-
when "stream" then return
|
177
|
+
when "endobj" then return obj
|
178
|
+
when "stream" then return stream(obj)
|
179
179
|
else raise MalformedPDFError, "PDF malformed, unexpected token #{post_obj}"
|
180
180
|
end
|
181
181
|
end
|
@@ -183,7 +183,7 @@ class PDF::Reader
|
|
183
183
|
# Decodes the contents of a PDF Stream and returns it as a Ruby String.
|
184
184
|
def stream (dict)
|
185
185
|
raise MalformedPDFError, "PDF malformed, missing stream length" unless dict.has_key?(:Length)
|
186
|
-
data = @buffer.read(@xref.object(dict[:Length])
|
186
|
+
data = @buffer.read(@xref.object(dict[:Length]))
|
187
187
|
|
188
188
|
Error.str_assert(parse_token, "endstream")
|
189
189
|
Error.str_assert(parse_token, "endobj")
|
@@ -200,7 +200,7 @@ class PDF::Reader
|
|
200
200
|
end
|
201
201
|
end
|
202
202
|
|
203
|
-
data
|
203
|
+
PDF::Reader::Stream.new(dict, data)
|
204
204
|
end
|
205
205
|
################################################################################
|
206
206
|
end
|
data/lib/pdf/reader/stream.rb
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
################################################################################
|
2
|
+
#
|
3
|
+
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
|
4
|
+
#
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
# a copy of this software and associated documentation files (the
|
7
|
+
# "Software"), to deal in the Software without restriction, including
|
8
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
# the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be
|
14
|
+
# included in all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
23
|
+
#
|
24
|
+
################################################################################
|
25
|
+
|
26
|
+
class PDF::Reader
|
27
|
+
################################################################################
|
28
|
+
# An internal PDF::Reader class that represents a single stream from a PDF file.
|
29
|
+
#
|
30
|
+
# Behaves like a Ruby String containing the stream data; the stream's dictionary is available through the hash accessor.
|
31
|
+
class Stream < String
|
32
|
+
attr_accessor :hash
|
33
|
+
################################################################################
|
34
|
+
# Creates a new stream from its dictionary (hash) and its data (val)
|
35
|
+
def initialize (hash, val)
|
36
|
+
@hash = hash
|
37
|
+
super val
|
38
|
+
end
|
39
|
+
################################################################################
|
40
|
+
end
|
41
|
+
################################################################################
|
42
|
+
end
|
43
|
+
################################################################################
|
data/lib/pdf/reader/xref.rb
CHANGED
@@ -61,15 +61,11 @@ class PDF::Reader
|
|
61
61
|
#
|
62
62
|
# If the object is a stream, that is returned as well
|
63
63
|
def object (ref, save_pos = true)
|
64
|
-
return ref
|
64
|
+
return ref unless ref.kind_of?(Reference)
|
65
65
|
pos = @buffer.pos if save_pos
|
66
|
-
obj
|
66
|
+
obj = Parser.new(@buffer.seek(offset_for(ref)), self).object(ref.id, ref.gen)
|
67
67
|
@buffer.seek(pos) if save_pos
|
68
|
-
|
69
|
-
return [obj, stream]
|
70
|
-
else
|
71
|
-
return [obj, nil]
|
72
|
-
end
|
68
|
+
return obj
|
73
69
|
end
|
74
70
|
################################################################################
|
75
71
|
# Assumes the underlying buffer is positioned at the start of an Xref table and
|
@@ -112,7 +108,7 @@ class PDF::Reader
|
|
112
108
|
end
|
113
109
|
# returns the type of object a ref points to
|
114
110
|
def obj_type(ref)
|
115
|
-
obj
|
111
|
+
obj = object(ref)
|
116
112
|
obj.class.to_s.to_sym
|
117
113
|
end
|
118
114
|
# returns true if the supplied reference points to an object with a stream
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdf-reader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.1
|
4
|
+
version: 0.7.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Peter Jones
|
@@ -9,14 +9,15 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2008-05-
|
12
|
+
date: 2008-05-20 00:00:00 +10:00
|
13
13
|
default_executable:
|
14
14
|
dependencies: []
|
15
15
|
|
16
16
|
description: The PDF::Reader library implements a PDF parser conforming as much as possible to the PDF specification from Adobe
|
17
17
|
email: pjones@pmade.com
|
18
|
-
executables:
|
19
|
-
|
18
|
+
executables:
|
19
|
+
- pdf_text
|
20
|
+
- pdf_list_callbacks
|
20
21
|
extensions: []
|
21
22
|
|
22
23
|
extra_rdoc_files:
|
@@ -41,6 +42,7 @@ files:
|
|
41
42
|
- lib/pdf/reader/register_receiver.rb
|
42
43
|
- lib/pdf/reader/font.rb
|
43
44
|
- lib/pdf/reader/glyphlist.txt
|
45
|
+
- lib/pdf/reader/stream.rb
|
44
46
|
- lib/pdf/reader/parser.rb.rej
|
45
47
|
- lib/pdf/reader.rb
|
46
48
|
- Rakefile
|
@@ -73,7 +75,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
73
75
|
requirements: []
|
74
76
|
|
75
77
|
rubyforge_project: pdf-reader
|
76
|
-
rubygems_version: 1.
|
78
|
+
rubygems_version: 1.1.1
|
77
79
|
signing_key:
|
78
80
|
specification_version: 2
|
79
81
|
summary: A library for accessing the content of PDF files
|