pdf-reader 0.6.2 → 0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +11 -0
- data/README +23 -0
- data/Rakefile +1 -1
- data/TODO +13 -12
- data/lib/pdf/reader.rb +27 -7
- data/lib/pdf/reader/buffer.rb +10 -2
- data/lib/pdf/reader/cmap.rb +3 -3
- data/lib/pdf/reader/content.rb +77 -37
- data/lib/pdf/reader/encoding.rb +789 -753
- data/lib/pdf/reader/filter.rb +17 -7
- data/lib/pdf/reader/font.rb +2 -1
- data/lib/pdf/reader/parser.rb +11 -11
- data/lib/pdf/reader/parser.rb.rej +29 -0
- data/lib/pdf/reader/text_receiver.rb +2 -2
- data/lib/pdf/reader/xref.rb +20 -8
- metadata +4 -4
- data/lib/pdf/reader/name.rb +0 -37
data/lib/pdf/reader/filter.rb
CHANGED
@@ -38,9 +38,9 @@ class PDF::Reader
|
|
38
38
|
def initialize (name, options)
|
39
39
|
@options = options
|
40
40
|
|
41
|
-
case name
|
42
|
-
when
|
43
|
-
#else
|
41
|
+
case name.to_sym
|
42
|
+
when :FlateDecode then @filter = :flate
|
43
|
+
#else raise UnsupportedFeatureError, "Unknown filter: #{name}"
|
44
44
|
end
|
45
45
|
end
|
46
46
|
################################################################################
|
@@ -56,11 +56,21 @@ class PDF::Reader
|
|
56
56
|
# Decode the specified data with the Zlib compression algorithm
|
57
57
|
def flate (data)
|
58
58
|
begin
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
59
|
+
Zlib::Inflate.new.inflate(data)
|
60
|
+
rescue Zlib::DataError => e
|
61
|
+
# by default, Ruby's Zlib assumes the data it's inflating
|
62
|
+
# is RFC1951 deflated data, wrapped in a RFC1950 zlib container.
|
63
|
+
# If that fails, then use an undocumented 'feature' to attempt to inflate
|
64
|
+
# the data as a raw RFC1951 stream.
|
65
|
+
#
|
66
|
+
# See
|
67
|
+
# - http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/243545
|
68
|
+
# - http://www.gzip.org/zlib/zlib_faq.html#faq38
|
69
|
+
Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(data)
|
63
70
|
end
|
71
|
+
rescue Exception => e
|
72
|
+
# Oops, there was a problem inflating the stream
|
73
|
+
raise MalformedPDFError, "Error occured while inflating a compressed stream (#{e.class.to_s}: #{e.to_s})"
|
64
74
|
end
|
65
75
|
################################################################################
|
66
76
|
end
|
data/lib/pdf/reader/font.rb
CHANGED
@@ -35,7 +35,8 @@ class PDF::Reader
|
|
35
35
|
@@glyphs ||= {}
|
36
36
|
|
37
37
|
if @@glyphs.empty?
|
38
|
-
|
38
|
+
RUBY_VERSION >= "1.9" ? mode = "r:BINARY" : mode = "r"
|
39
|
+
File.open(File.dirname(__FILE__) + "/glyphlist.txt",mode) do |f|
|
39
40
|
f.each do |l|
|
40
41
|
m, name, code = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
|
41
42
|
@@glyphs[name] = "0x#{code}".hex if name
|
data/lib/pdf/reader/parser.rb
CHANGED
@@ -32,7 +32,7 @@ class PDF::Reader
|
|
32
32
|
# Create a new parser around a PDF::Reader::Buffer object
|
33
33
|
#
|
34
34
|
# buffer - a PDF::Reader::Buffer object that contains PDF data
|
35
|
-
# xref -
|
35
|
+
# xref - a PDF::Reader::XRef object that represents the document's object offsets
|
36
36
|
def initialize (buffer, xref)
|
37
37
|
@buffer = buffer
|
38
38
|
@xref = xref
|
@@ -47,7 +47,7 @@ class PDF::Reader
|
|
47
47
|
token = @buffer.token
|
48
48
|
|
49
49
|
case token
|
50
|
-
when "/" then return
|
50
|
+
when "/" then return @buffer.token.to_sym
|
51
51
|
when "<<" then return dictionary()
|
52
52
|
when "[" then return array()
|
53
53
|
when "(" then return string()
|
@@ -72,7 +72,7 @@ class PDF::Reader
|
|
72
72
|
loop do
|
73
73
|
key = parse_token
|
74
74
|
break if key.kind_of?(Token) and key == ">>"
|
75
|
-
raise MalformedPDFError, "Dictionary key (#{key.inspect}) is not a name" unless key.kind_of?(
|
75
|
+
raise MalformedPDFError, "Dictionary key (#{key.inspect}) is not a name" unless key.kind_of?(Symbol)
|
76
76
|
|
77
77
|
value = parse_token
|
78
78
|
value.kind_of?(Token) and Error.str_assert_not(value, ">>")
|
@@ -174,28 +174,28 @@ class PDF::Reader
|
|
174
174
|
obj = parse_token
|
175
175
|
post_obj = parse_token
|
176
176
|
case post_obj
|
177
|
-
when "endobj" then return obj
|
178
|
-
when "stream" then return obj, stream(obj)
|
177
|
+
when "endobj" then return [obj,nil]
|
178
|
+
when "stream" then return [obj, stream(obj)]
|
179
179
|
else raise MalformedPDFError, "PDF malformed, unexpected token #{post_obj}"
|
180
180
|
end
|
181
181
|
end
|
182
182
|
################################################################################
|
183
183
|
# Decodes the contents of a PDF Stream and returns it as a Ruby String.
|
184
184
|
def stream (dict)
|
185
|
-
raise MalformedPDFError, "PDF malformed, missing stream length" unless dict.has_key?(
|
186
|
-
data = @buffer.read(@xref.object(dict[
|
185
|
+
raise MalformedPDFError, "PDF malformed, missing stream length" unless dict.has_key?(:Length)
|
186
|
+
data = @buffer.read(@xref.object(dict[:Length]).first)
|
187
187
|
|
188
188
|
Error.str_assert(parse_token, "endstream")
|
189
189
|
Error.str_assert(parse_token, "endobj")
|
190
190
|
|
191
|
-
if dict.has_key?(
|
191
|
+
if dict.has_key?(:Filter)
|
192
192
|
options = []
|
193
193
|
|
194
|
-
if dict.has_key?(
|
195
|
-
options = Array(dict[
|
194
|
+
if dict.has_key?(:DecodeParms)
|
195
|
+
options = Array(dict[:DecodeParms])
|
196
196
|
end
|
197
197
|
|
198
|
-
Array(dict[
|
198
|
+
Array(dict[:Filter]).each_with_index do |filter, index|
|
199
199
|
data = Filter.new(filter, options[index]).filter(data)
|
200
200
|
end
|
201
201
|
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
***************
|
2
|
+
*** 173,178 ****
|
3
|
+
|
4
|
+
obj = parse_token
|
5
|
+
post_obj = parse_token
|
6
|
+
case post_obj
|
7
|
+
when "endobj" then return [obj,nil]
|
8
|
+
when "stream" then return [obj, stream(obj)]
|
9
|
+
--- 173,192 ----
|
10
|
+
|
11
|
+
obj = parse_token
|
12
|
+
post_obj = parse_token
|
13
|
+
+
|
14
|
+
+ if obj.class == Array
|
15
|
+
+ newobj = Array.new
|
16
|
+
+ obj.each_index {|idx|
|
17
|
+
+ if obj[idx].class == PDF::Reader::Reference
|
18
|
+
+ xo, xs = @xref.object(obj[idx])
|
19
|
+
+ if xs
|
20
|
+
+ newobj << xs
|
21
|
+
+ end
|
22
|
+
+ end
|
23
|
+
+ }
|
24
|
+
+ return newobj.flatten
|
25
|
+
+ end
|
26
|
+
+
|
27
|
+
case post_obj
|
28
|
+
when "endobj" then return [obj,nil]
|
29
|
+
when "stream" then return [obj, stream(obj)]
|
@@ -217,8 +217,8 @@ class PDF::Reader
|
|
217
217
|
def media_box_check (dict)
|
218
218
|
corners = (@upper_corners.last || {:urx => 0, :ury => 0}).dup
|
219
219
|
|
220
|
-
if dict.has_key?(
|
221
|
-
media_box = dict[
|
220
|
+
if dict.has_key?(:MediaBox)
|
221
|
+
media_box = dict[:MediaBox]
|
222
222
|
corners[:urx] = media_box[2] - media_box[0]
|
223
223
|
corners[:ury] = media_box[3] - media_box[1]
|
224
224
|
end
|
data/lib/pdf/reader/xref.rb
CHANGED
@@ -9,10 +9,10 @@
|
|
9
9
|
# distribute, sublicense, and/or sell copies of the Software, and to
|
10
10
|
# permit persons to whom the Software is furnished to do so, subject to
|
11
11
|
# the following conditions:
|
12
|
-
#
|
12
|
+
#
|
13
13
|
# The above copyright notice and this permission notice shall be
|
14
14
|
# included in all copies or substantial portions of the Software.
|
15
|
-
#
|
15
|
+
#
|
16
16
|
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
17
|
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
18
|
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
@@ -45,9 +45,11 @@ class PDF::Reader
|
|
45
45
|
offset ||= @buffer.find_first_xref_offset
|
46
46
|
@buffer.seek(offset)
|
47
47
|
token = @buffer.token
|
48
|
-
|
48
|
+
|
49
49
|
if token == "xref" || token == "ref"
|
50
50
|
load_xref_table
|
51
|
+
elsif token.to_i >= 0 && @buffer.token.to_i >= 0 && @buffer.token == "obj"
|
52
|
+
raise PDF::Reader::UnsupportedFeatureError, "XRef streams are not supported in PDF::Reader yet"
|
51
53
|
else
|
52
54
|
raise PDF::Reader::MalformedPDFError, "xref table not found at offset #{offset} (#{token} != xref)"
|
53
55
|
end
|
@@ -59,14 +61,14 @@ class PDF::Reader
|
|
59
61
|
#
|
60
62
|
# If the object is a stream, that is returned as well
|
61
63
|
def object (ref, save_pos = true)
|
62
|
-
return ref unless ref.kind_of?(Reference)
|
64
|
+
return ref, nil unless ref.kind_of?(Reference)
|
63
65
|
pos = @buffer.pos if save_pos
|
64
66
|
obj, stream = Parser.new(@buffer.seek(offset_for(ref)), self).object(ref.id, ref.gen)
|
65
67
|
@buffer.seek(pos) if save_pos
|
66
68
|
if stream
|
67
|
-
return obj, stream
|
69
|
+
return [obj, stream]
|
68
70
|
else
|
69
|
-
return obj
|
71
|
+
return [obj, nil]
|
70
72
|
end
|
71
73
|
end
|
72
74
|
################################################################################
|
@@ -78,7 +80,7 @@ class PDF::Reader
|
|
78
80
|
begin
|
79
81
|
# loop over all subsections of the xref table
|
80
82
|
# In a well formed PDF, the 'trailer' token will indicate
|
81
|
-
# the end of the table. However we need to be careful in case
|
83
|
+
# the end of the table. However we need to be careful in case
|
82
84
|
# we're processing a malformed pdf that is missing the trailer.
|
83
85
|
loop do
|
84
86
|
tok_one, tok_two = @buffer.token, @buffer.token
|
@@ -104,10 +106,20 @@ class PDF::Reader
|
|
104
106
|
raise MalformedPDFError, "PDF malformed, trailer should be a dictionary" unless tok_two == "<<"
|
105
107
|
|
106
108
|
trailer = Parser.new(@buffer, self).dictionary
|
107
|
-
load(trailer[
|
109
|
+
load(trailer[:Prev].to_i) if trailer.has_key?(:Prev)
|
108
110
|
|
109
111
|
trailer
|
110
112
|
end
|
113
|
+
# returns the type of object a ref points to
|
114
|
+
def obj_type(ref)
|
115
|
+
obj, stream = object(ref)
|
116
|
+
obj.class.to_s.to_sym
|
117
|
+
end
|
118
|
+
# returns true if the supplied reference points to an object with a stream
|
119
|
+
def stream?(ref)
|
120
|
+
obj, stream = @xref.object(ref)
|
121
|
+
stream ? true : false
|
122
|
+
end
|
111
123
|
################################################################################
|
112
124
|
# returns the byte offset for the specified PDF object.
|
113
125
|
#
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdf-reader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: "0.7"
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Peter Jones
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2008-
|
12
|
+
date: 2008-05-06 00:00:00 +10:00
|
13
13
|
default_executable:
|
14
14
|
dependencies: []
|
15
15
|
|
@@ -28,9 +28,8 @@ files:
|
|
28
28
|
- lib/pdf/reader
|
29
29
|
- lib/pdf/reader/explore.rb
|
30
30
|
- lib/pdf/reader/reference.rb
|
31
|
-
- lib/pdf/reader/name.rb
|
32
|
-
- lib/pdf/reader/token.rb
|
33
31
|
- lib/pdf/reader/xref.rb
|
32
|
+
- lib/pdf/reader/token.rb
|
34
33
|
- lib/pdf/reader/filter.rb
|
35
34
|
- lib/pdf/reader/text_receiver.rb
|
36
35
|
- lib/pdf/reader/buffer.rb
|
@@ -42,6 +41,7 @@ files:
|
|
42
41
|
- lib/pdf/reader/register_receiver.rb
|
43
42
|
- lib/pdf/reader/font.rb
|
44
43
|
- lib/pdf/reader/glyphlist.txt
|
44
|
+
- lib/pdf/reader/parser.rb.rej
|
45
45
|
- lib/pdf/reader.rb
|
46
46
|
- Rakefile
|
47
47
|
- README
|
data/lib/pdf/reader/name.rb
DELETED
@@ -1,37 +0,0 @@
|
|
1
|
-
################################################################################
|
2
|
-
#
|
3
|
-
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
|
4
|
-
#
|
5
|
-
# Permission is hereby granted, free of charge, to any person obtaining
|
6
|
-
# a copy of this software and associated documentation files (the
|
7
|
-
# "Software"), to deal in the Software without restriction, including
|
8
|
-
# without limitation the rights to use, copy, modify, merge, publish,
|
9
|
-
# distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
-
# permit persons to whom the Software is furnished to do so, subject to
|
11
|
-
# the following conditions:
|
12
|
-
#
|
13
|
-
# The above copyright notice and this permission notice shall be
|
14
|
-
# included in all copies or substantial portions of the Software.
|
15
|
-
#
|
16
|
-
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
-
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
-
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
-
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
-
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
-
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
-
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
23
|
-
#
|
24
|
-
################################################################################
|
25
|
-
|
26
|
-
class PDF::Reader
|
27
|
-
################################################################################
|
28
|
-
class Name < String
|
29
|
-
################################################################################
|
30
|
-
def initialize (val)
|
31
|
-
super
|
32
|
-
end
|
33
|
-
################################################################################
|
34
|
-
end
|
35
|
-
################################################################################
|
36
|
-
end
|
37
|
-
################################################################################
|