pdf-reader 0.6.2 → 0.7
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +11 -0
- data/README +23 -0
- data/Rakefile +1 -1
- data/TODO +13 -12
- data/lib/pdf/reader.rb +27 -7
- data/lib/pdf/reader/buffer.rb +10 -2
- data/lib/pdf/reader/cmap.rb +3 -3
- data/lib/pdf/reader/content.rb +77 -37
- data/lib/pdf/reader/encoding.rb +789 -753
- data/lib/pdf/reader/filter.rb +17 -7
- data/lib/pdf/reader/font.rb +2 -1
- data/lib/pdf/reader/parser.rb +11 -11
- data/lib/pdf/reader/parser.rb.rej +29 -0
- data/lib/pdf/reader/text_receiver.rb +2 -2
- data/lib/pdf/reader/xref.rb +20 -8
- metadata +4 -4
- data/lib/pdf/reader/name.rb +0 -37
data/lib/pdf/reader/filter.rb
CHANGED
@@ -38,9 +38,9 @@ class PDF::Reader
|
|
38
38
|
def initialize (name, options)
|
39
39
|
@options = options
|
40
40
|
|
41
|
-
case name
|
42
|
-
when
|
43
|
-
#else
|
41
|
+
case name.to_sym
|
42
|
+
when :FlateDecode then @filter = :flate
|
43
|
+
#else raise UnsupportedFeatureError, "Unknown filter: #{name}"
|
44
44
|
end
|
45
45
|
end
|
46
46
|
################################################################################
|
@@ -56,11 +56,21 @@ class PDF::Reader
|
|
56
56
|
# Decode the specified data with the Zlib compression algorithm
|
57
57
|
def flate (data)
|
58
58
|
begin
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
59
|
+
Zlib::Inflate.new.inflate(data)
|
60
|
+
rescue Zlib::DataError => e
|
61
|
+
# by default, Ruby's Zlib assumes the data it's inflating
|
62
|
+
# is RFC1951 deflated data, wrapped in an RFC1950 zlib container.
|
63
|
+
# If that fails, then use an undocumented 'feature' to attempt to inflate
|
64
|
+
# the data as a raw RFC1951 stream.
|
65
|
+
#
|
66
|
+
# See
|
67
|
+
# - http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/243545
|
68
|
+
# - http://www.gzip.org/zlib/zlib_faq.html#faq38
|
69
|
+
Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(data)
|
63
70
|
end
|
71
|
+
rescue Exception => e
|
72
|
+
# Oops, there was a problem inflating the stream
|
73
|
+
raise MalformedPDFError, "Error occured while inflating a compressed stream (#{e.class.to_s}: #{e.to_s})"
|
64
74
|
end
|
65
75
|
################################################################################
|
66
76
|
end
|
data/lib/pdf/reader/font.rb
CHANGED
@@ -35,7 +35,8 @@ class PDF::Reader
|
|
35
35
|
@@glyphs ||= {}
|
36
36
|
|
37
37
|
if @@glyphs.empty?
|
38
|
-
|
38
|
+
RUBY_VERSION >= "1.9" ? mode = "r:BINARY" : mode = "r"
|
39
|
+
File.open(File.dirname(__FILE__) + "/glyphlist.txt",mode) do |f|
|
39
40
|
f.each do |l|
|
40
41
|
m, name, code = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
|
41
42
|
@@glyphs[name] = "0x#{code}".hex if name
|
data/lib/pdf/reader/parser.rb
CHANGED
@@ -32,7 +32,7 @@ class PDF::Reader
|
|
32
32
|
# Create a new parser around a PDF::Reader::Buffer object
|
33
33
|
#
|
34
34
|
# buffer - a PDF::Reader::Buffer object that contains PDF data
|
35
|
-
# xref -
|
35
|
+
# xref - a PDF::Reader::XRef object that represents the document's object offsets
|
36
36
|
def initialize (buffer, xref)
|
37
37
|
@buffer = buffer
|
38
38
|
@xref = xref
|
@@ -47,7 +47,7 @@ class PDF::Reader
|
|
47
47
|
token = @buffer.token
|
48
48
|
|
49
49
|
case token
|
50
|
-
when "/" then return
|
50
|
+
when "/" then return @buffer.token.to_sym
|
51
51
|
when "<<" then return dictionary()
|
52
52
|
when "[" then return array()
|
53
53
|
when "(" then return string()
|
@@ -72,7 +72,7 @@ class PDF::Reader
|
|
72
72
|
loop do
|
73
73
|
key = parse_token
|
74
74
|
break if key.kind_of?(Token) and key == ">>"
|
75
|
-
raise MalformedPDFError, "Dictionary key (#{key.inspect}) is not a name" unless key.kind_of?(
|
75
|
+
raise MalformedPDFError, "Dictionary key (#{key.inspect}) is not a name" unless key.kind_of?(Symbol)
|
76
76
|
|
77
77
|
value = parse_token
|
78
78
|
value.kind_of?(Token) and Error.str_assert_not(value, ">>")
|
@@ -174,28 +174,28 @@ class PDF::Reader
|
|
174
174
|
obj = parse_token
|
175
175
|
post_obj = parse_token
|
176
176
|
case post_obj
|
177
|
-
when "endobj" then return obj
|
178
|
-
when "stream" then return obj, stream(obj)
|
177
|
+
when "endobj" then return [obj,nil]
|
178
|
+
when "stream" then return [obj, stream(obj)]
|
179
179
|
else raise MalformedPDFError, "PDF malformed, unexpected token #{post_obj}"
|
180
180
|
end
|
181
181
|
end
|
182
182
|
################################################################################
|
183
183
|
# Decodes the contents of a PDF Stream and returns it as a Ruby String.
|
184
184
|
def stream (dict)
|
185
|
-
raise MalformedPDFError, "PDF malformed, missing stream length" unless dict.has_key?(
|
186
|
-
data = @buffer.read(@xref.object(dict[
|
185
|
+
raise MalformedPDFError, "PDF malformed, missing stream length" unless dict.has_key?(:Length)
|
186
|
+
data = @buffer.read(@xref.object(dict[:Length]).first)
|
187
187
|
|
188
188
|
Error.str_assert(parse_token, "endstream")
|
189
189
|
Error.str_assert(parse_token, "endobj")
|
190
190
|
|
191
|
-
if dict.has_key?(
|
191
|
+
if dict.has_key?(:Filter)
|
192
192
|
options = []
|
193
193
|
|
194
|
-
if dict.has_key?(
|
195
|
-
options = Array(dict[
|
194
|
+
if dict.has_key?(:DecodeParms)
|
195
|
+
options = Array(dict[:DecodeParms])
|
196
196
|
end
|
197
197
|
|
198
|
-
Array(dict[
|
198
|
+
Array(dict[:Filter]).each_with_index do |filter, index|
|
199
199
|
data = Filter.new(filter, options[index]).filter(data)
|
200
200
|
end
|
201
201
|
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
***************
|
2
|
+
*** 173,178 ****
|
3
|
+
|
4
|
+
obj = parse_token
|
5
|
+
post_obj = parse_token
|
6
|
+
case post_obj
|
7
|
+
when "endobj" then return [obj,nil]
|
8
|
+
when "stream" then return [obj, stream(obj)]
|
9
|
+
--- 173,192 ----
|
10
|
+
|
11
|
+
obj = parse_token
|
12
|
+
post_obj = parse_token
|
13
|
+
+
|
14
|
+
+ if obj.class == Array
|
15
|
+
+ newobj = Array.new
|
16
|
+
+ obj.each_index {|idx|
|
17
|
+
+ if obj[idx].class == PDF::Reader::Reference
|
18
|
+
+ xo, xs = @xref.object(obj[idx])
|
19
|
+
+ if xs
|
20
|
+
+ newobj << xs
|
21
|
+
+ end
|
22
|
+
+ end
|
23
|
+
+ }
|
24
|
+
+ return newobj.flatten
|
25
|
+
+ end
|
26
|
+
+
|
27
|
+
case post_obj
|
28
|
+
when "endobj" then return [obj,nil]
|
29
|
+
when "stream" then return [obj, stream(obj)]
|
@@ -217,8 +217,8 @@ class PDF::Reader
|
|
217
217
|
def media_box_check (dict)
|
218
218
|
corners = (@upper_corners.last || {:urx => 0, :ury => 0}).dup
|
219
219
|
|
220
|
-
if dict.has_key?(
|
221
|
-
media_box = dict[
|
220
|
+
if dict.has_key?(:MediaBox)
|
221
|
+
media_box = dict[:MediaBox]
|
222
222
|
corners[:urx] = media_box[2] - media_box[0]
|
223
223
|
corners[:ury] = media_box[3] - media_box[1]
|
224
224
|
end
|
data/lib/pdf/reader/xref.rb
CHANGED
@@ -9,10 +9,10 @@
|
|
9
9
|
# distribute, sublicense, and/or sell copies of the Software, and to
|
10
10
|
# permit persons to whom the Software is furnished to do so, subject to
|
11
11
|
# the following conditions:
|
12
|
-
#
|
12
|
+
#
|
13
13
|
# The above copyright notice and this permission notice shall be
|
14
14
|
# included in all copies or substantial portions of the Software.
|
15
|
-
#
|
15
|
+
#
|
16
16
|
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
17
|
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
18
|
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
@@ -45,9 +45,11 @@ class PDF::Reader
|
|
45
45
|
offset ||= @buffer.find_first_xref_offset
|
46
46
|
@buffer.seek(offset)
|
47
47
|
token = @buffer.token
|
48
|
-
|
48
|
+
|
49
49
|
if token == "xref" || token == "ref"
|
50
50
|
load_xref_table
|
51
|
+
elsif token.to_i >= 0 && @buffer.token.to_i >= 0 && @buffer.token == "obj"
|
52
|
+
raise PDF::Reader::UnsupportedFeatureError, "XRef streams are not supported in PDF::Reader yet"
|
51
53
|
else
|
52
54
|
raise PDF::Reader::MalformedPDFError, "xref table not found at offset #{offset} (#{token} != xref)"
|
53
55
|
end
|
@@ -59,14 +61,14 @@ class PDF::Reader
|
|
59
61
|
#
|
60
62
|
# If the object is a stream, that is returned as well
|
61
63
|
def object (ref, save_pos = true)
|
62
|
-
return ref unless ref.kind_of?(Reference)
|
64
|
+
return ref, nil unless ref.kind_of?(Reference)
|
63
65
|
pos = @buffer.pos if save_pos
|
64
66
|
obj, stream = Parser.new(@buffer.seek(offset_for(ref)), self).object(ref.id, ref.gen)
|
65
67
|
@buffer.seek(pos) if save_pos
|
66
68
|
if stream
|
67
|
-
return obj, stream
|
69
|
+
return [obj, stream]
|
68
70
|
else
|
69
|
-
return obj
|
71
|
+
return [obj, nil]
|
70
72
|
end
|
71
73
|
end
|
72
74
|
################################################################################
|
@@ -78,7 +80,7 @@ class PDF::Reader
|
|
78
80
|
begin
|
79
81
|
# loop over all subsections of the xref table
|
80
82
|
# In a well formed PDF, the 'trailer' token will indicate
|
81
|
-
# the end of the table. However we need to be careful in case
|
83
|
+
# the end of the table. However we need to be careful in case
|
82
84
|
# we're processing a malformed pdf that is missing the trailer.
|
83
85
|
loop do
|
84
86
|
tok_one, tok_two = @buffer.token, @buffer.token
|
@@ -104,10 +106,20 @@ class PDF::Reader
|
|
104
106
|
raise MalformedPDFError, "PDF malformed, trailer should be a dictionary" unless tok_two == "<<"
|
105
107
|
|
106
108
|
trailer = Parser.new(@buffer, self).dictionary
|
107
|
-
load(trailer[
|
109
|
+
load(trailer[:Prev].to_i) if trailer.has_key?(:Prev)
|
108
110
|
|
109
111
|
trailer
|
110
112
|
end
|
113
|
+
# returns the type of object a ref points to
|
114
|
+
def obj_type(ref)
|
115
|
+
obj, stream = object(ref)
|
116
|
+
obj.class.to_s.to_sym
|
117
|
+
end
|
118
|
+
# returns true if the supplied reference points to an object with a stream
|
119
|
+
def stream?(ref)
|
120
|
+
obj, stream = @xref.object(ref)
|
121
|
+
stream ? true : false
|
122
|
+
end
|
111
123
|
################################################################################
|
112
124
|
# returns the byte offset for the specified PDF object.
|
113
125
|
#
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdf-reader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: "0.7"
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Peter Jones
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2008-
|
12
|
+
date: 2008-05-06 00:00:00 +10:00
|
13
13
|
default_executable:
|
14
14
|
dependencies: []
|
15
15
|
|
@@ -28,9 +28,8 @@ files:
|
|
28
28
|
- lib/pdf/reader
|
29
29
|
- lib/pdf/reader/explore.rb
|
30
30
|
- lib/pdf/reader/reference.rb
|
31
|
-
- lib/pdf/reader/name.rb
|
32
|
-
- lib/pdf/reader/token.rb
|
33
31
|
- lib/pdf/reader/xref.rb
|
32
|
+
- lib/pdf/reader/token.rb
|
34
33
|
- lib/pdf/reader/filter.rb
|
35
34
|
- lib/pdf/reader/text_receiver.rb
|
36
35
|
- lib/pdf/reader/buffer.rb
|
@@ -42,6 +41,7 @@ files:
|
|
42
41
|
- lib/pdf/reader/register_receiver.rb
|
43
42
|
- lib/pdf/reader/font.rb
|
44
43
|
- lib/pdf/reader/glyphlist.txt
|
44
|
+
- lib/pdf/reader/parser.rb.rej
|
45
45
|
- lib/pdf/reader.rb
|
46
46
|
- Rakefile
|
47
47
|
- README
|
data/lib/pdf/reader/name.rb
DELETED
@@ -1,37 +0,0 @@
|
|
1
|
-
################################################################################
|
2
|
-
#
|
3
|
-
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
|
4
|
-
#
|
5
|
-
# Permission is hereby granted, free of charge, to any person obtaining
|
6
|
-
# a copy of this software and associated documentation files (the
|
7
|
-
# "Software"), to deal in the Software without restriction, including
|
8
|
-
# without limitation the rights to use, copy, modify, merge, publish,
|
9
|
-
# distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
-
# permit persons to whom the Software is furnished to do so, subject to
|
11
|
-
# the following conditions:
|
12
|
-
#
|
13
|
-
# The above copyright notice and this permission notice shall be
|
14
|
-
# included in all copies or substantial portions of the Software.
|
15
|
-
#
|
16
|
-
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
-
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
-
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
-
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
-
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
-
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
-
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
23
|
-
#
|
24
|
-
################################################################################
|
25
|
-
|
26
|
-
class PDF::Reader
|
27
|
-
################################################################################
|
28
|
-
class Name < String
|
29
|
-
################################################################################
|
30
|
-
def initialize (val)
|
31
|
-
super
|
32
|
-
end
|
33
|
-
################################################################################
|
34
|
-
end
|
35
|
-
################################################################################
|
36
|
-
end
|
37
|
-
################################################################################
|