fireinc-pdf-reader 0.11.0.alpha
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +168 -0
- data/MIT-LICENSE +21 -0
- data/README.rdoc +137 -0
- data/Rakefile +34 -0
- data/TODO +45 -0
- data/bin/pdf_list_callbacks +15 -0
- data/bin/pdf_object +48 -0
- data/bin/pdf_text +15 -0
- data/examples/callbacks.rb +21 -0
- data/examples/extract_bates.rb +49 -0
- data/examples/extract_images.rb +108 -0
- data/examples/hash.rb +12 -0
- data/examples/metadata.rb +25 -0
- data/examples/page_counter_improved.rb +23 -0
- data/examples/page_counter_naive.rb +24 -0
- data/examples/rspec.rb +57 -0
- data/examples/text.rb +40 -0
- data/examples/version.rb +25 -0
- data/lib/pdf/hash.rb +15 -0
- data/lib/pdf/reader/abstract_strategy.rb +81 -0
- data/lib/pdf/reader/buffer.rb +346 -0
- data/lib/pdf/reader/cmap.rb +138 -0
- data/lib/pdf/reader/encoding.rb +190 -0
- data/lib/pdf/reader/encodings/mac_expert.txt +159 -0
- data/lib/pdf/reader/encodings/mac_roman.txt +128 -0
- data/lib/pdf/reader/encodings/pdf_doc.txt +40 -0
- data/lib/pdf/reader/encodings/standard.txt +47 -0
- data/lib/pdf/reader/encodings/symbol.txt +154 -0
- data/lib/pdf/reader/encodings/win_ansi.txt +29 -0
- data/lib/pdf/reader/encodings/zapf_dingbats.txt +201 -0
- data/lib/pdf/reader/error.rb +53 -0
- data/lib/pdf/reader/filter.rb +219 -0
- data/lib/pdf/reader/font.rb +133 -0
- data/lib/pdf/reader/form_xobject.rb +83 -0
- data/lib/pdf/reader/glyphlist.txt +4322 -0
- data/lib/pdf/reader/lzw.rb +123 -0
- data/lib/pdf/reader/metadata_strategy.rb +56 -0
- data/lib/pdf/reader/object_cache.rb +85 -0
- data/lib/pdf/reader/object_hash.rb +289 -0
- data/lib/pdf/reader/object_stream.rb +51 -0
- data/lib/pdf/reader/page.rb +185 -0
- data/lib/pdf/reader/page_text_receiver.rb +278 -0
- data/lib/pdf/reader/pages_strategy.rb +475 -0
- data/lib/pdf/reader/parser.rb +225 -0
- data/lib/pdf/reader/print_receiver.rb +18 -0
- data/lib/pdf/reader/reference.rb +66 -0
- data/lib/pdf/reader/register_receiver.rb +95 -0
- data/lib/pdf/reader/stream.rb +69 -0
- data/lib/pdf/reader/text_receiver.rb +264 -0
- data/lib/pdf/reader/token.rb +41 -0
- data/lib/pdf/reader/xref.rb +220 -0
- data/lib/pdf/reader.rb +296 -0
- data/lib/pdf-reader.rb +1 -0
- metadata +211 -0
@@ -0,0 +1,123 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
module PDF
|
4
|
+
|
5
|
+
class Reader
|
6
|
+
|
7
|
+
# A general class for decoding LZW compressed data. LZW can be
# used in PDF files to compress streams, usually for image data sourced
# from a TIFF file.
#
# See the following links for more information:
#
# ref http://www.fileformat.info/format/tiff/corion-lzw.htm
# ref http://marknelson.us/1989/10/01/lzw-data-compression/
#
# The PDF spec also has some data on the algorithm.
#
class LZW # :nodoc:

  # Reads fixed-width groups of bits (most significant bit first) from a
  # string of bytes. The width can be changed between reads, which is what
  # LZW needs as its codes grow from 9 to 12 bits.
  class BitStream # :nodoc:

    # data          - the bytes to read from (never modified)
    # bits_in_chunk - number of bits consumed by each call to #read
    def initialize(data, bits_in_chunk)
      # Work on a private copy: forcing the encoding in place would change
      # the caller's string and raises on frozen strings.
      @data = data.dup
      @data.force_encoding("BINARY") if @data.respond_to?(:force_encoding)
      @bits_in_chunk = bits_in_chunk
      @current_pos = 0
      @bits_left_in_byte = 8
    end

    # Change the number of bits consumed by subsequent calls to #read.
    def set_bits_in_chunk(bits_in_chunk)
      @bits_in_chunk = bits_in_chunk
    end

    # Returns the next chunk as an Integer, or nil when the data does not
    # hold another complete chunk (end of data, or only a few trailing
    # padding bits are left).
    def read
      bits_left_in_chunk = @bits_in_chunk
      chunk = nil
      while bits_left_in_chunk > 0 && @current_pos < @data.size
        chunk ||= 0
        codepoint = @data[@current_pos, 1].unpack("C*")[0]
        # clear the bits of this byte that earlier reads already consumed
        current_byte = codepoint & ((1 << @bits_left_in_byte) - 1)
        dif = bits_left_in_chunk - @bits_left_in_byte
        if dif > 0
          # the chunk needs more bits than this byte has left: make room
          current_byte <<= dif
        elsif dif < 0
          # the byte has more bits than the chunk needs: keep the top ones
          current_byte >>= -dif
        end
        chunk |= current_byte # add bits to result
        bits_left_in_chunk = dif >= 0 ? dif : 0
        @bits_left_in_byte = dif < 0 ? -dif : 0
        if @bits_left_in_byte.zero? # next byte
          @current_pos += 1
          @bits_left_in_byte = 8
        end
      end
      # a chunk that ran out of data part way through is not a valid code
      bits_left_in_chunk.zero? ? chunk : nil
    end
  end

  CODE_EOD = 257 # end of data
  CODE_CLEAR_TABLE = 256 # clear table

  # Stores the pairs code => string. Codes below 258 are never stored:
  # they map directly to the single byte with that value.
  class StringTable < Hash # :nodoc:
    attr_reader :string_table_pos

    def initialize
      super
      @string_table_pos = 258 # first code available for new entries
    end

    # if code less than 258 return fixed string
    def [](key)
      if key > 257 then super else key.chr end
    end

    # Store string under the next free code.
    def add(string)
      store(@string_table_pos, string)
      @string_table_pos += 1
    end
  end

  # Decompresses a LZW compressed string.
  #
  # data - the compressed bytes (anything responding to to_s)
  #
  # Returns the decoded string. Decoding stops at the end-of-data code, or
  # when the input runs out if that code is missing. Raises ArgumentError
  # if the first code after a reset refers to a table entry that cannot
  # exist yet.
  #
  def self.decode(data)
    stream = BitStream.new(data.to_s, 9) # size of codes between 9 and 12 bits
    # start with a usable table even if the data omits the leading clear code
    string_table = StringTable.new
    old_code = nil
    result = ''.dup

    loop do
      code = stream.read
      break if code.nil? || code == CODE_EOD

      if code == CODE_CLEAR_TABLE
        # a cleared table starts again with 9 bit codes
        stream.set_bits_in_chunk(9)
        string_table = StringTable.new
        old_code = nil
        next
      end

      string = string_table[code]
      if old_code.nil?
        # first code after a reset: plain output, nothing to add to the table
        raise ArgumentError, "invalid LZW data" if string.nil?
        result << string
      elsif string
        result << string
        string_table.add create_new_string(string_table, old_code, code)
      else
        # the code is not in the table yet: it can only be the entry that
        # is about to be created from the previous string
        string = create_new_string(string_table, old_code, old_code)
        result << string
        string_table.add string
      end
      old_code = code

      # increase the size of the codes when limit reached
      case string_table.string_table_pos
      when 511 then stream.set_bits_in_chunk(10)
      when 1023 then stream.set_bits_in_chunk(11)
      when 2047 then stream.set_bits_in_chunk(12)
      end
    end
    result
  end

  # Internal helper: the string for some_code followed by the first byte of
  # the string for other_code. It remains publicly callable, as before,
  # because a bare `private` does not apply to methods defined on self.
  def self.create_new_string(string_table, some_code, other_code)
    string_table[some_code] + string_table[other_code][0, 1]
  end

end
|
122
|
+
end
|
123
|
+
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
class PDF::Reader
|
4
|
+
|
5
|
+
# DEPRECATED: this class was deprecated in version 0.11.0 and will
# eventually be removed
#
class MetadataStrategy < AbstractStrategy # :nodoc:

  # Symbolic name of this strategy.
  def self.to_sym
    :metadata
  end

  # Emits the metadata callbacks: the PDF version, the info dictionary
  # (when info? is true), the XML metadata (when present) and the page
  # count (when pages? is true).
  #
  # Returns false without doing anything unless options[:metadata] is set.
  def process
    return false unless options[:metadata]

    # may be useful to some people
    callback(:pdf_version, ohash.pdf_version)

    # ye olde metadata
    if info?
      callback(:metadata, [decoded_info])
    end

    # new style xml metadata
    if xml_metadata?
      callback(:xml_metadata, [xml_metadata])
    end

    # page count
    if pages?
      page_total = ohash.object(pages[:Count])
      callback(:page_count, page_total.to_i)
    end
  end

  private

  # The XML metadata stream as a string tagged as utf-8, or nil when the
  # root object has no :Metadata entry. The answer is remembered, nil
  # included, so the stream is only read once.
  def xml_metadata
    return @xml_metadata if defined?(@xml_metadata)

    @xml_metadata =
      unless root[:Metadata].nil?
        raw = ohash.object(root[:Metadata]).unfiltered_data
        raw.force_encoding("utf-8") if raw.respond_to?(:force_encoding)
        raw
      end
  end

  # true when XML metadata is available, false otherwise
  def xml_metadata?
    !!xml_metadata
  end

  # The info dictionary passed through decode_strings, computed on first use.
  def decoded_info
    @decoded_info = decode_strings(info) unless @decoded_info
    @decoded_info
  end

end
|
56
|
+
end
|
@@ -0,0 +1,85 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
class PDF::Reader
|
4
|
+
|
5
|
+
# A Hash-like object for caching commonly used objects from a PDF file.
# Only hashes whose :Type is listed in CACHEABLE_TYPES are kept; attempts
# to store anything else are silently ignored.
#
# This is an internal class used by PDF::Reader::ObjectHash
#
class ObjectCache # :nodoc:

  # These object types use little memory and are accessed a heap of times as
  # part of random page access, so we'll cache the unmarshalled objects and
  # avoid lots of repetitive (and expensive) tokenising
  CACHEABLE_TYPES = [:Catalog, :Page, :Pages]

  def initialize
    @objects = Hash.new
  end

  # the cached object for key, or nil
  def [](key)
    @objects[key]
  end

  # remember value under key, but only if it is a cacheable type
  def []=(key, value)
    return unless cacheable?(value)

    @objects[key] = value
  end

  # the cached object for key, or local_default when nothing is cached
  def fetch(key, local_default = nil)
    @objects.fetch(key, local_default)
  end

  def each(&block)
    @objects.each(&block)
  end
  alias_method :each_pair, :each

  def each_key(&block)
    @objects.each_key(&block)
  end

  def each_value(&block)
    @objects.each_value(&block)
  end

  # number of cached objects
  def size
    @objects.length
  end
  alias_method :length, :size

  def empty?
    @objects.empty?
  end

  def has_key?(key)
    @objects.key?(key)
  end
  alias_method :include?, :has_key?
  alias_method :key?, :has_key?
  alias_method :member?, :has_key?

  def has_value?(value)
    @objects.value?(value)
  end

  def to_s
    "<PDF::Reader::ObjectCache size: #{size}>"
  end

  def keys
    @objects.keys
  end

  def values
    @objects.values
  end

  private

  # true only for hashes whose :Type is one of CACHEABLE_TYPES
  def cacheable?(obj)
    return false unless obj.is_a?(Hash)

    CACHEABLE_TYPES.include?(obj[:Type])
  end

end
|
85
|
+
end
|
@@ -0,0 +1,289 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
class PDF::Reader
|
4
|
+
# Provides low level access to the objects in a PDF file via a hash-like
# object.
#
# A PDF file can be viewed as a large hash map. It is a series of objects
# stored at precise byte offsets, and a table that maps object IDs to byte
# offsets. Given an object ID, looking up an object is an O(1) operation.
#
# Each PDF object can be mapped to a ruby object, so by passing an object
# ID to the [] method, a ruby representation of that object will be
# retrieved.
#
# The class behaves much like a standard Ruby hash, including the use of
# the Enumerable mixin. The key difference is no []= method - the hash
# is read only.
#
# == Basic Usage
#
#     h = PDF::Reader::ObjectHash.new("somefile.pdf")
#     h[1]
#     => 3469
#
#     h[PDF::Reader::Reference.new(1,0)]
#     => 3469
#
class ObjectHash
  include Enumerable

  CACHEABLE_TYPES = [:Catalog, :Page, :Pages]

  attr_accessor :default
  attr_reader :trailer, :pdf_version

  # Creates a new ObjectHash object. input can be an IO-like object (anything
  # that responds to seek and read) or a string with a valid filename.
  #
  # Raises ArgumentError for any other input, and
  # PDF::Reader::UnsupportedFeatureError when the trailer has an :Encrypt entry.
  #
  def initialize(input)
    if input.respond_to?(:seek) && input.respond_to?(:read)
      @io = input
    elsif File.file?(input.to_s)
      # read the whole file as bytes and work on an in-memory copy
      if File.respond_to?(:binread)
        input = File.binread(input.to_s)
      else
        input = File.read(input.to_s)
      end
      @io = StringIO.new(input)
    else
      raise ArgumentError, "input must be an IO-like object or a filename"
    end
    @pdf_version = read_version
    @xref        = PDF::Reader::XRef.new(@io)
    @trailer     = @xref.trailer
    @cache       = PDF::Reader::ObjectCache.new

    if trailer[:Encrypt]
      raise ::PDF::Reader::UnsupportedFeatureError, 'PDF::Reader cannot read encrypted PDF files'
    end
  end

  # returns the type of object a ref points to, as a symbol made from the
  # class name of the ruby object. Returns nil if the lookup raises.
  def obj_type(ref)
    self[ref].class.to_s.to_sym
  rescue
    nil
  end

  # returns true if the supplied references points to an object with a stream
  def stream?(ref)
    self[ref].class == PDF::Reader::Stream
  rescue
    false
  end

  # Access an object from the PDF. key can be an int or a PDF::Reader::Reference
  # object.
  #
  # If an int is used, the object with that ID and a generation number of 0 will
  # be returned.
  #
  # If a PDF::Reader::Reference object is used the exact ID and generation number
  # can be specified.
  #
  # Returns default for keys that are not positive and when the lookup
  # raises InvalidObjectError.
  #
  def [](key)
    return default if key.to_i <= 0
    begin
      unless key.kind_of?(PDF::Reader::Reference)
        key = PDF::Reader::Reference.new(key.to_i, 0)
      end
      return @cache[key] if @cache.has_key?(key)

      location = xref[key]
      case location
      when Integer
        # Integer rather than Fixnum: the Fixnum constant no longer exists
        # on current rubies. The value is a byte offset into the file.
        buf = new_buffer(location)
        @cache[key] = Parser.new(buf, self).object(key.id, key.gen)
      when PDF::Reader::Reference
        # the object is stored inside an object stream
        object_streams[location] ||= PDF::Reader::ObjectStream.new(object(location))
        @cache[key] = object_streams[location][key.id]
      end
    rescue InvalidObjectError
      return default
    end
  end

  # true if obj is a Hash whose :Type is one of CACHEABLE_TYPES
  def cacheable?(obj)
    obj.is_a?(Hash) && CACHEABLE_TYPES.include?(obj[:Type])
  end

  # If key is a PDF::Reader::Reference object, lookup the corresponding
  # object in the PDF and return it. Otherwise return key untouched.
  #
  def object(key)
    key.is_a?(PDF::Reader::Reference) ? self[key] : key
  end
  alias :deref :object

  # Access an object from the PDF. key can be an int or a PDF::Reader::Reference
  # object.
  #
  # If an int is used, the object with that ID and a generation number of 0 will
  # be returned.
  #
  # If a PDF::Reader::Reference object is used the exact ID and generation number
  # can be specified.
  #
  # local_default is the object that will be returned if the requested key doesn't
  # exist.
  #
  # Raises IndexError only when nothing was found, no local_default was
  # given and the key is not positive; a positive key that finds nothing
  # returns nil.
  #
  def fetch(key, local_default = nil)
    obj = self[key]
    if obj
      return obj
    elsif local_default
      return local_default
    else
      raise IndexError, "#{key} is invalid" if key.to_i <= 0
    end
  end

  # iterate over each key, value. Just like a ruby hash.
  #
  def each(&block)
    xref.each do |ref|
      yield ref, self[ref]
    end
  end
  alias :each_pair :each

  # iterate over each key. Just like a ruby hash.
  #
  # The keys come straight from the xref table, so no objects are loaded.
  #
  def each_key(&block)
    xref.each do |ref|
      yield ref
    end
  end

  # iterate over each value. Just like a ruby hash.
  #
  def each_value(&block)
    each do |id, obj|
      yield obj
    end
  end

  # return the number of objects in the file. An object with multiple generations
  # is counted once.
  def size
    xref.size
  end
  alias :length :size

  # return true if there are no objects in this file
  #
  def empty?
    size == 0
  end

  # return true if the specified key exists in the file. key
  # can be an int or a PDF::Reader::Reference
  #
  # An int matches on the object ID alone, whatever the generation.
  #
  def has_key?(check_key)
    # TODO update from O(n) to O(1)
    if check_key.kind_of?(PDF::Reader::Reference)
      each_key { |key| return true if check_key == key }
    else
      wanted_id = check_key.to_i
      each_key { |key| return true if wanted_id == key.id }
    end
    false
  end
  alias :include? :has_key?
  alias :key? :has_key?
  alias :member? :has_key?

  # return true if the specified value exists in the file
  #
  def has_value?(value)
    # TODO update from O(n) to O(1)
    each_value do |obj|
      return true if obj == value
    end
    false
  end
  alias :value? :has_value?

  def to_s
    "<PDF::Reader::ObjectHash size: #{self.size}>"
  end

  # return an array of all keys in the file
  #
  def keys
    ret = []
    each_key { |k| ret << k }
    ret
  end

  # return an array of all values in the file
  #
  def values
    ret = []
    each_value { |v| ret << v }
    ret
  end

  # return an array of all values from the specified keys
  #
  def values_at(*ids)
    ids.map { |id| self[id] }
  end

  # return an array of arrays. Each sub array contains a key/value pair.
  #
  def to_a
    ret = []
    each do |id, obj|
      ret << [id, obj]
    end
    ret
  end

  # returns an array of PDF::Reader::References. Each reference in the
  # array points a Page object, one for each page in the PDF. The first
  # reference is page 1, second reference is page 2, etc.
  #
  # Useful for apps that want to extract data from specific pages.
  #
  def page_references
    # the page tree is only walked on the first call
    @page_references ||= begin
      root = fetch(trailer[:Root])
      get_page_objects(root[:Pages]).flatten
    end
  end

  private

  # a buffer over the underlying IO, positioned at offset
  def new_buffer(offset = 0)
    PDF::Reader::Buffer.new(@io, :seek => offset)
  end

  def xref
    @xref
  end

  # object streams that have already been opened, keyed by their reference
  def object_streams
    @object_stream ||= {}
  end

  # returns a nested array of object references for all pages in this object store.
  #
  def get_page_objects(ref)
    obj = fetch(ref)

    if obj[:Type] == :Page
      ref
    elsif obj[:Type] == :Pages
      obj[:Kids].map { |kid| get_page_objects(kid) }
    end
  end

  # Reads the version from the "PDF-x.y" marker in the first 10 bytes and
  # rewinds the IO. Returns 0.0 when no marker is found.
  def read_version
    @io.seek(0)
    header = @io.read(10).to_s # read returns nil on an empty IO
    @io.seek(0)
    header[/PDF-(\d\.\d)/, 1].to_f
  end

end
|
289
|
+
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
class PDF::Reader
|
4
|
+
|
5
|
+
# provides a wrapper around a PDF stream object that contains other objects in it.
# This is done for added compression and is described as an "Object Stream" in the spec.
#
class ObjectStream # :nodoc:

  # stream - must respond to hash (the stream dictionary) and
  #          unfiltered_data (the decoded stream contents)
  def initialize(stream)
    @dict = stream.hash
    @data = stream.unfiltered_data
  end

  # Parse and return the object stored under objid, or nil if this stream
  # has no entry for it.
  def [](objid)
    position = offsets[objid]
    return nil if position.nil?

    io = StringIO.new(@data)
    PDF::Reader::Parser.new(PDF::Reader::Buffer.new(io, :seek => position)).parse_token
  end

  # the :N entry of the stream dictionary
  def size
    @dict[:N]
  end

  private

  # Maps each object ID to the position of that object within the data.
  # The data begins with `size` pairs of integers: an object ID followed by
  # an offset, which is added to the :First entry of the dictionary.
  def offsets
    @offsets ||= {}
    return @offsets unless @offsets.empty?

    size.times do
      objid = buffer.token.to_i # first number of the pair
      @offsets[objid] = first + buffer.token.to_i # second number of the pair
    end
    @offsets
  end

  # the :First entry of the stream dictionary
  def first
    @dict[:First]
  end

  # a single shared buffer over the data, used to read the pairs above
  def buffer
    @buffer ||= PDF::Reader::Buffer.new(StringIO.new(@data))
  end

end
|
49
|
+
|
50
|
+
end
|
51
|
+
|