fireinc-pdf-reader 0.11.0.alpha

Sign up to get free protection for your applications and to get access to all the features.
Files changed (54) hide show
  1. data/CHANGELOG +168 -0
  2. data/MIT-LICENSE +21 -0
  3. data/README.rdoc +137 -0
  4. data/Rakefile +34 -0
  5. data/TODO +45 -0
  6. data/bin/pdf_list_callbacks +15 -0
  7. data/bin/pdf_object +48 -0
  8. data/bin/pdf_text +15 -0
  9. data/examples/callbacks.rb +21 -0
  10. data/examples/extract_bates.rb +49 -0
  11. data/examples/extract_images.rb +108 -0
  12. data/examples/hash.rb +12 -0
  13. data/examples/metadata.rb +25 -0
  14. data/examples/page_counter_improved.rb +23 -0
  15. data/examples/page_counter_naive.rb +24 -0
  16. data/examples/rspec.rb +57 -0
  17. data/examples/text.rb +40 -0
  18. data/examples/version.rb +25 -0
  19. data/lib/pdf/hash.rb +15 -0
  20. data/lib/pdf/reader/abstract_strategy.rb +81 -0
  21. data/lib/pdf/reader/buffer.rb +346 -0
  22. data/lib/pdf/reader/cmap.rb +138 -0
  23. data/lib/pdf/reader/encoding.rb +190 -0
  24. data/lib/pdf/reader/encodings/mac_expert.txt +159 -0
  25. data/lib/pdf/reader/encodings/mac_roman.txt +128 -0
  26. data/lib/pdf/reader/encodings/pdf_doc.txt +40 -0
  27. data/lib/pdf/reader/encodings/standard.txt +47 -0
  28. data/lib/pdf/reader/encodings/symbol.txt +154 -0
  29. data/lib/pdf/reader/encodings/win_ansi.txt +29 -0
  30. data/lib/pdf/reader/encodings/zapf_dingbats.txt +201 -0
  31. data/lib/pdf/reader/error.rb +53 -0
  32. data/lib/pdf/reader/filter.rb +219 -0
  33. data/lib/pdf/reader/font.rb +133 -0
  34. data/lib/pdf/reader/form_xobject.rb +83 -0
  35. data/lib/pdf/reader/glyphlist.txt +4322 -0
  36. data/lib/pdf/reader/lzw.rb +123 -0
  37. data/lib/pdf/reader/metadata_strategy.rb +56 -0
  38. data/lib/pdf/reader/object_cache.rb +85 -0
  39. data/lib/pdf/reader/object_hash.rb +289 -0
  40. data/lib/pdf/reader/object_stream.rb +51 -0
  41. data/lib/pdf/reader/page.rb +185 -0
  42. data/lib/pdf/reader/page_text_receiver.rb +278 -0
  43. data/lib/pdf/reader/pages_strategy.rb +475 -0
  44. data/lib/pdf/reader/parser.rb +225 -0
  45. data/lib/pdf/reader/print_receiver.rb +18 -0
  46. data/lib/pdf/reader/reference.rb +66 -0
  47. data/lib/pdf/reader/register_receiver.rb +95 -0
  48. data/lib/pdf/reader/stream.rb +69 -0
  49. data/lib/pdf/reader/text_receiver.rb +264 -0
  50. data/lib/pdf/reader/token.rb +41 -0
  51. data/lib/pdf/reader/xref.rb +220 -0
  52. data/lib/pdf/reader.rb +296 -0
  53. data/lib/pdf-reader.rb +1 -0
  54. metadata +211 -0
@@ -0,0 +1,123 @@
1
+ # coding: utf-8
2
+
3
module PDF

  class Reader

    # A general class for decoding LZW compressed data. LZW can be
    # used in PDF files to compress streams, usually for image data sourced
    # from a TIFF file.
    #
    # See the following links for more information:
    #
    # ref http://www.fileformat.info/format/tiff/corion-lzw.htm
    # ref http://marknelson.us/1989/10/01/lzw-data-compression/
    #
    # The PDF spec also has some data on the algorithm.
    #
    class LZW # :nodoc:

      # Reads successive groups of bits ("chunks") from a byte string, most
      # significant bit first. The chunk width can be changed between reads.
      class BitStream # :nodoc:

        # data          - the string to read bits from
        # bits_in_chunk - how many bits each call to #read consumes
        def initialize(data, bits_in_chunk)
          # Work on a private copy: force_encoding mutates its receiver, which
          # would change the caller's string and raise on frozen input.
          @data = data.dup
          @data.force_encoding("BINARY") if @data.respond_to?(:force_encoding)
          @bits_in_chunk = bits_in_chunk
          @current_pos = 0
          @bits_left_in_byte = 8
        end

        # Change the number of bits consumed by subsequent calls to #read.
        def set_bits_in_chunk(bits_in_chunk)
          @bits_in_chunk = bits_in_chunk
        end

        # Returns the next chunk as an integer, or nil when the data runs out
        # before a complete chunk could be assembled.
        def read
          bits_left_in_chunk = @bits_in_chunk
          chunk = 0
          while bits_left_in_chunk > 0 && @current_pos < @data.size
            codepoint = @data[@current_pos, 1].unpack("C*")[0]
            current_byte = codepoint & (2**@bits_left_in_byte - 1) # clear consumed bits
            dif = bits_left_in_chunk - @bits_left_in_byte
            if dif > 0
              current_byte <<= dif
            elsif dif < 0
              current_byte >>= dif.abs
            end
            chunk |= current_byte # add bits to result
            bits_left_in_chunk = dif >= 0 ? dif : 0
            @bits_left_in_byte = dif < 0 ? dif.abs : 0
            if @bits_left_in_byte.zero? # next byte
              @current_pos += 1
              @bits_left_in_byte = 8
            end
          end
          # A partially filled chunk is only trailing padding or truncated
          # data, never a real code, so report it as "no more data".
          bits_left_in_chunk.zero? ? chunk : nil
        end
      end

      CODE_EOD = 257 # end of data
      CODE_CLEAR_TABLE = 256 # clear table
      INITIAL_CODE_BITS = 9 # width of a code right after the table is (re)initialised

      # stores the pairs code => string
      class StringTable < Hash # :nodoc:
        attr_reader :string_table_pos

        def initialize
          super
          @string_table_pos = 258 # initial code
        end

        # if code is less than 258 return the fixed single-byte string
        def [](key)
          if key > 257 then super else key.chr end
        end

        # Store string under the next free code.
        def add(string)
          store(@string_table_pos, string)
          @string_table_pos += 1
        end
      end

      # Decompresses a LZW compressed string.
      #
      # data - the compressed bytes (anything responding to to_s)
      #
      # Returns the decoded string. Decoding stops at the end-of-data code or,
      # if that code is missing, when the input is exhausted.
      #
      def self.decode(data)
        stream = BitStream.new(data.to_s, INITIAL_CODE_BITS) # codes are between 9 and 12 bits
        # Encoders normally emit a clear-table code first, but start with a
        # usable table so streams that omit it can still be decoded.
        string_table = StringTable.new
        old_code = nil
        result = ''.dup

        while (code = stream.read)
          break if code == CODE_EOD

          if code == CODE_CLEAR_TABLE
            # A cleared table starts over with the initial code width.
            stream.set_bits_in_chunk(INITIAL_CODE_BITS)
            string_table = StringTable.new
            old_code = nil
            next
          end

          if old_code.nil?
            # First code after a (re)start: emitted as-is, no table entry yet.
            result << string_table[code]
            old_code = code
            next
          end

          string = string_table[code]
          if string
            result << string
            string_table.add create_new_string(string_table, old_code, code)
          else
            # The code is not in the table yet: it refers to the entry that is
            # about to be created from the previous string.
            new_string = create_new_string(string_table, old_code, old_code)
            result << new_string
            string_table.add new_string
          end
          old_code = code

          # increase the size of the codes when the limit is reached
          case string_table.string_table_pos
          when 511 then stream.set_bits_in_chunk(10)
          when 1023 then stream.set_bits_in_chunk(11)
          when 2047 then stream.set_bits_in_chunk(12)
          end
        end
        result
      end

      # Builds the string for a new table entry: the string for some_code
      # followed by the first byte of the string for other_code.
      #
      # NOTE: a bare `private` has no effect on `def self.` methods, so this
      # helper has always been publicly callable; that is left unchanged.
      def self.create_new_string(string_table, some_code, other_code)
        string_table[some_code] + string_table[other_code][0, 1]
      end

    end
  end
end
@@ -0,0 +1,56 @@
1
+ # coding: utf-8
2
+
3
class PDF::Reader

  # DEPRECATED: this class was deprecated in version 0.11.0 and will
  # eventually be removed
  #
  # Emits document-level callbacks: the PDF version, the info dictionary,
  # the XML metadata stream and the page count.
  #
  class MetadataStrategy < AbstractStrategy # :nodoc:

    # The option key that enables this strategy.
    def self.to_sym
      :metadata
    end

    # Fire the metadata callbacks. Returns false without doing anything when
    # the :metadata option is not set.
    def process
      return false unless options[:metadata]

      # may be useful to some people
      callback(:pdf_version, ohash.pdf_version)

      # old style info dictionary
      if info?
        callback(:metadata, [decoded_info])
      end

      # new style xml metadata
      if xml_metadata?
        callback(:xml_metadata, [xml_metadata])
      end

      # page count
      return unless pages?

      callback(:page_count, ohash.object(pages[:Count]).to_i)
    end

    private

    # The document's XML metadata as a string, or nil when the root object has
    # no :Metadata entry. The result (including nil) is computed only once.
    def xml_metadata
      unless defined?(@xml_metadata)
        @xml_metadata =
          if root[:Metadata].nil?
            nil
          else
            raw = ohash.object(root[:Metadata]).unfiltered_data
            raw.force_encoding("utf-8") if raw.respond_to?(:force_encoding)
            raw
          end
      end
      @xml_metadata
    end

    # true when the document carries XML metadata
    def xml_metadata?
      !!xml_metadata
    end

    # The info dictionary passed through decode_strings, memoized.
    def decoded_info
      @decoded_info = decode_strings(info) unless @decoded_info
      @decoded_info
    end

  end
end
@@ -0,0 +1,85 @@
1
+ # coding: utf-8
2
+
3
class PDF::Reader

  # A Hash-like object for caching commonly used objects from a PDF file.
  # Only hashes whose :Type is one of CACHEABLE_TYPES are stored; attempts to
  # store anything else are silently ignored.
  #
  # This is an internal class used by PDF::Reader::ObjectHash
  #
  class ObjectCache # :nodoc:

    # These object types use little memory and are accessed a heap of times as
    # part of random page access, so we'll cache the unmarshalled objects and
    # avoid lots of repetitive (and expensive) tokenising
    CACHEABLE_TYPES = [:Catalog, :Page, :Pages]

    def initialize
      @objects = {}
    end

    # The cached object for key, or nil.
    def [](key)
      @objects[key]
    end

    # Cache value under key, but only if it is a cacheable type.
    def []=(key, value)
      return unless cacheable?(value)

      @objects[key] = value
    end

    # Like #[], but returns local_default when key is not cached.
    def fetch(key, local_default = nil)
      @objects.fetch(key, local_default)
    end

    def each(&block)
      @objects.each(&block)
    end
    alias_method :each_pair, :each

    def each_key(&block)
      @objects.each_key(&block)
    end

    def each_value(&block)
      @objects.each_value(&block)
    end

    # Number of cached objects.
    def size
      @objects.size
    end
    alias_method :length, :size

    def empty?
      @objects.empty?
    end

    def has_key?(key)
      @objects.key?(key)
    end
    alias_method :include?, :has_key?
    alias_method :key?, :has_key?
    alias_method :member?, :has_key?

    def has_value?(value)
      @objects.value?(value)
    end

    def to_s
      "<PDF::Reader::ObjectCache size: #{size}>"
    end

    def keys
      @objects.keys
    end

    def values
      @objects.values
    end

    private

    # true when obj is a Hash whose :Type entry is listed in CACHEABLE_TYPES
    def cacheable?(obj)
      return false unless obj.is_a?(Hash)

      CACHEABLE_TYPES.include?(obj[:Type])
    end

  end
end
@@ -0,0 +1,289 @@
1
+ # coding: utf-8
2
+
3
class PDF::Reader
  # Provides low level access to the objects in a PDF file via a hash-like
  # object.
  #
  # A PDF file can be viewed as a large hash map. It is a series of objects
  # stored at precise byte offsets, and a table that maps object IDs to byte
  # offsets. Given an object ID, looking up an object is an O(1) operation.
  #
  # Each PDF object can be mapped to a ruby object, so by passing an object
  # ID to the [] method, a ruby representation of that object will be
  # retrieved.
  #
  # The class behaves much like a standard Ruby hash, including the use of
  # the Enumerable mixin. The key difference is no []= method - the hash
  # is read only.
  #
  # == Basic Usage
  #
  #     h = PDF::Reader::ObjectHash.new("somefile.pdf")
  #     h[1]
  #     => 3469
  #
  #     h[PDF::Reader::Reference.new(1,0)]
  #     => 3469
  #
  class ObjectHash
    include Enumerable

    CACHEABLE_TYPES = [:Catalog, :Page, :Pages]

    attr_accessor :default
    attr_reader :trailer, :pdf_version

    # Creates a new ObjectHash object. input can be an IO-like object (one
    # that responds to seek and read) or a string naming an existing file.
    #
    # Raises ArgumentError for any other input and
    # PDF::Reader::UnsupportedFeatureError when the trailer has an :Encrypt
    # entry.
    #
    def initialize(input)
      if input.respond_to?(:seek) && input.respond_to?(:read)
        @io = input
      elsif File.file?(input.to_s)
        # File.binread is not available on every supported ruby
        if File.respond_to?(:binread)
          input = File.binread(input.to_s)
        else
          input = File.read(input.to_s)
        end
        @io = StringIO.new(input)
      else
        raise ArgumentError, "input must be an IO-like object or a filename"
      end
      @pdf_version = read_version
      @xref        = PDF::Reader::XRef.new(@io)
      @trailer     = @xref.trailer
      @cache       = PDF::Reader::ObjectCache.new

      if trailer[:Encrypt]
        raise ::PDF::Reader::UnsupportedFeatureError, 'PDF::Reader cannot read encrypted PDF files'
      end
    end

    # returns the type of object a ref points to, as a symbol built from the
    # ruby class name. Returns nil if the lookup raises.
    def obj_type(ref)
      self[ref].class.to_s.to_sym
    rescue
      nil
    end

    # returns true if the supplied reference points to an object with a stream
    def stream?(ref)
      self[ref].class == PDF::Reader::Stream
    rescue
      false
    end

    # Access an object from the PDF. key can be an int or a PDF::Reader::Reference
    # object.
    #
    # If an int is used, the object with that ID and a generation number of 0 will
    # be returned.
    #
    # If a PDF::Reader::Reference object is used the exact ID and generation number
    # can be specified.
    #
    # Returns the value of #default when key.to_i is not positive or when
    # InvalidObjectError is raised during the lookup.
    #
    def [](key)
      return default if key.to_i <= 0
      begin
        unless key.kind_of?(PDF::Reader::Reference)
          key = PDF::Reader::Reference.new(key.to_i, 0)
        end
        if @cache.has_key?(key)
          @cache[key]
        else
          location = xref[key]
          # Integer rather than Fixnum: Fixnum no longer exists on current
          # rubies, and Integer also matches on older ones.
          if location.is_a?(Integer)
            # the xref entry is passed to the buffer as a seek position
            buf = new_buffer(location)
            @cache[key] = Parser.new(buf, self).object(key.id, key.gen)
          elsif location.is_a?(PDF::Reader::Reference)
            # the xref entry points at an object stream holding the object
            object_streams[location] ||= PDF::Reader::ObjectStream.new(object(location))
            @cache[key] = object_streams[location][key.id]
          end
        end
      rescue InvalidObjectError
        return default
      end
    end

    # true when obj is a Hash whose :Type entry is listed in CACHEABLE_TYPES
    def cacheable?(obj)
      obj.is_a?(Hash) && CACHEABLE_TYPES.include?(obj[:Type])
    end

    # If key is a PDF::Reader::Reference object, lookup the corresponding
    # object in the PDF and return it. Otherwise return key untouched.
    #
    def object(key)
      key.is_a?(PDF::Reader::Reference) ? self[key] : key
    end
    alias :deref :object

    # Access an object from the PDF. key can be an int or a PDF::Reader::Reference
    # object.
    #
    # If an int is used, the object with that ID and a generation number of 0 will
    # be returned.
    #
    # If a PDF::Reader::Reference object is used the exact ID and generation number
    # can be specified.
    #
    # local_default is the object that will be returned if the requested key doesn't
    # exist.
    #
    # When nothing is found and no local_default is given, IndexError is
    # raised if key.to_i is not positive; otherwise nil is returned.
    #
    def fetch(key, local_default = nil)
      obj = self[key]
      if obj
        return obj
      elsif local_default
        return local_default
      else
        raise IndexError, "#{key} is invalid" if key.to_i <= 0
      end
    end

    # iterate over each key, value. Just like a ruby hash.
    #
    def each(&block)
      @xref.each do |ref|
        yield ref, self[ref]
      end
    end
    alias :each_pair :each

    # iterate over each key. Just like a ruby hash.
    #
    def each_key(&block)
      # Walk the xref directly so that no object is loaded just to
      # enumerate the keys.
      @xref.each do |ref|
        yield ref
      end
    end

    # iterate over each value. Just like a ruby hash.
    #
    def each_value(&block)
      each do |id, obj|
        yield obj
      end
    end

    # return the number of objects in the file. An object with multiple generations
    # is counted once.
    def size
      xref.size
    end
    alias :length :size

    # return true if there are no objects in this file
    #
    def empty?
      size == 0
    end

    # return true if the specified key exists in the file. key
    # can be an int or a PDF::Reader::Reference
    #
    def has_key?(check_key)
      # TODO update from O(n) to O(1)
      each_key do |key|
        if check_key.kind_of?(PDF::Reader::Reference)
          return true if check_key == key
        else
          return true if check_key.to_i == key.id
        end
      end
      return false
    end
    alias :include? :has_key?
    alias :key? :has_key?
    alias :member? :has_key?

    # return true if the specified value exists in the file
    #
    def has_value?(value)
      # TODO update from O(n) to O(1)
      each_value do |obj|
        return true if obj == value
      end
      return false
    end
    # value? must test values, so it aliases has_value? (not has_key?)
    alias :value? :has_value?

    def to_s
      "<PDF::Reader::ObjectHash size: #{self.size}>"
    end

    # return an array of all keys in the file
    #
    def keys
      ret = []
      each_key { |k| ret << k }
      ret
    end

    # return an array of all values in the file
    #
    def values
      ret = []
      each_value { |v| ret << v }
      ret
    end

    # return an array of all values from the specified keys
    #
    def values_at(*ids)
      ids.map { |id| self[id] }
    end

    # return an array of arrays. Each sub array contains a key/value pair.
    #
    def to_a
      map { |id, obj| [id, obj] }
    end

    # returns an array of PDF::Reader::References. Each reference in the
    # array points a Page object, one for each page in the PDF. The first
    # reference is page 1, second reference is page 2, etc.
    #
    # Useful for apps that want to extract data from specific pages.
    #
    def page_references
      # only walk the page tree (and fetch the root) the first time
      @page_references ||= begin
        root = fetch(trailer[:Root])
        get_page_objects(root[:Pages]).flatten
      end
    end

    private

    # a new buffer over the underlying IO, positioned at offset
    def new_buffer(offset = 0)
      PDF::Reader::Buffer.new(@io, :seek => offset)
    end

    def xref
      @xref
    end

    # object streams already loaded, keyed by the reference of the stream
    def object_streams
      @object_stream ||= {}
    end

    # returns a nested array of object references for all pages in this object store.
    # Returns nil for an object that is neither a :Page nor a :Pages.
    #
    def get_page_objects(ref)
      obj = fetch(ref)

      if obj[:Type] == :Page
        ref
      elsif obj[:Type] == :Pages
        obj[:Kids].map { |kid| get_page_objects(kid) }
      end
    end

    # Reads the version number from the "PDF-x.y" marker in the first 10
    # bytes of the file and rewinds the IO. Returns 0.0 when there is no
    # marker (including when the IO is empty).
    def read_version
      @io.seek(0)
      header = @io.read(10).to_s
      @io.seek(0)
      match = /PDF-(\d\.\d)/.match(header)
      match ? match[1].to_f : 0.0
    end

  end
end
@@ -0,0 +1,51 @@
1
+ # coding: utf-8
2
+
3
class PDF::Reader

  # provides a wrapper around a PDF stream object that contains other objects in it.
  # This is done for added compression and is described as an "Object Stream" in the spec.
  #
  class ObjectStream # :nodoc:

    # stream - an object exposing the stream dictionary via #hash and the
    #          decoded stream body via #unfiltered_data
    def initialize(stream)
      @dict = stream.hash
      @data = stream.unfiltered_data
    end

    # Parse and return the object stored under objid, or nil when this
    # stream has no entry for that id.
    def [](objid)
      position = offsets[objid]
      return nil if position.nil?

      io = StringIO.new(@data)
      PDF::Reader::Parser.new(PDF::Reader::Buffer.new(io, :seek => position)).parse_token
    end

    # The number of objects declared by the stream dictionary (its :N entry).
    def size
      @dict[:N]
    end

    private

    # Maps object id => position of that object inside the stream data.
    # The table is read as `size` pairs of integer tokens from the start of
    # the data; each second token is added to the dictionary's :First value.
    def offsets
      @offsets ||= {}
      if @offsets.empty?
        size.times do
          id = buffer.token.to_i
          @offsets[id] = first + buffer.token.to_i
        end
      end
      @offsets
    end

    # The :First entry of the stream dictionary.
    def first
      @dict[:First]
    end

    # A single shared buffer over the stream data, used to read the offset table.
    def buffer
      @buffer ||= PDF::Reader::Buffer.new(StringIO.new(@data))
    end

  end

end
51
+