fireinc-pdf-reader 0.11.0.alpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. data/CHANGELOG +168 -0
  2. data/MIT-LICENSE +21 -0
  3. data/README.rdoc +137 -0
  4. data/Rakefile +34 -0
  5. data/TODO +45 -0
  6. data/bin/pdf_list_callbacks +15 -0
  7. data/bin/pdf_object +48 -0
  8. data/bin/pdf_text +15 -0
  9. data/examples/callbacks.rb +21 -0
  10. data/examples/extract_bates.rb +49 -0
  11. data/examples/extract_images.rb +108 -0
  12. data/examples/hash.rb +12 -0
  13. data/examples/metadata.rb +25 -0
  14. data/examples/page_counter_improved.rb +23 -0
  15. data/examples/page_counter_naive.rb +24 -0
  16. data/examples/rspec.rb +57 -0
  17. data/examples/text.rb +40 -0
  18. data/examples/version.rb +25 -0
  19. data/lib/pdf/hash.rb +15 -0
  20. data/lib/pdf/reader/abstract_strategy.rb +81 -0
  21. data/lib/pdf/reader/buffer.rb +346 -0
  22. data/lib/pdf/reader/cmap.rb +138 -0
  23. data/lib/pdf/reader/encoding.rb +190 -0
  24. data/lib/pdf/reader/encodings/mac_expert.txt +159 -0
  25. data/lib/pdf/reader/encodings/mac_roman.txt +128 -0
  26. data/lib/pdf/reader/encodings/pdf_doc.txt +40 -0
  27. data/lib/pdf/reader/encodings/standard.txt +47 -0
  28. data/lib/pdf/reader/encodings/symbol.txt +154 -0
  29. data/lib/pdf/reader/encodings/win_ansi.txt +29 -0
  30. data/lib/pdf/reader/encodings/zapf_dingbats.txt +201 -0
  31. data/lib/pdf/reader/error.rb +53 -0
  32. data/lib/pdf/reader/filter.rb +219 -0
  33. data/lib/pdf/reader/font.rb +133 -0
  34. data/lib/pdf/reader/form_xobject.rb +83 -0
  35. data/lib/pdf/reader/glyphlist.txt +4322 -0
  36. data/lib/pdf/reader/lzw.rb +123 -0
  37. data/lib/pdf/reader/metadata_strategy.rb +56 -0
  38. data/lib/pdf/reader/object_cache.rb +85 -0
  39. data/lib/pdf/reader/object_hash.rb +289 -0
  40. data/lib/pdf/reader/object_stream.rb +51 -0
  41. data/lib/pdf/reader/page.rb +185 -0
  42. data/lib/pdf/reader/page_text_receiver.rb +278 -0
  43. data/lib/pdf/reader/pages_strategy.rb +475 -0
  44. data/lib/pdf/reader/parser.rb +225 -0
  45. data/lib/pdf/reader/print_receiver.rb +18 -0
  46. data/lib/pdf/reader/reference.rb +66 -0
  47. data/lib/pdf/reader/register_receiver.rb +95 -0
  48. data/lib/pdf/reader/stream.rb +69 -0
  49. data/lib/pdf/reader/text_receiver.rb +264 -0
  50. data/lib/pdf/reader/token.rb +41 -0
  51. data/lib/pdf/reader/xref.rb +220 -0
  52. data/lib/pdf/reader.rb +296 -0
  53. data/lib/pdf-reader.rb +1 -0
  54. metadata +211 -0
@@ -0,0 +1,123 @@
1
+ # coding: utf-8
2
+
3
+ module PDF
4
+
5
+ class Reader
6
+
7
+ # A general class for decoding LZW compressed data. LZW can be
8
+ # used in PDF files to compress streams, usually for image data sourced
9
+ # from a TIFF file.
10
+ #
11
+ # See the following links for more information:
12
+ #
13
+ # ref http://www.fileformat.info/format/tiff/corion-lzw.htm
14
+ # ref http://marknelson.us/1989/10/01/lzw-data-compression/
15
+ #
16
+ # The PDF spec also has some data on the algorithm.
17
+ #
18
class LZW # :nodoc:

  # Reads fixed-width chunks of bits (LZW codes) from a string of bytes,
  # most significant bit first. The chunk width may be changed between
  # reads, which is how the decoder moves from 9 bit to 12 bit codes.
  class BitStream # :nodoc:

    # data          - the bytes to read. When it responds to force_encoding
    #                 it is re-tagged as BINARY in place, so that indexing
    #                 works on bytes rather than characters.
    # bits_in_chunk - how many bits each call to #read should return.
    def initialize(data, bits_in_chunk)
      @data = data
      @data.force_encoding("BINARY") if @data.respond_to?(:force_encoding)
      @bits_in_chunk = bits_in_chunk
      @current_pos = 0
      @bits_left_in_byte = 8
    end

    # Change the width (in bits) of the chunks returned by later reads.
    def set_bits_in_chunk(bits_in_chunk)
      @bits_in_chunk = bits_in_chunk
    end

    # Returns the next chunk as an Integer, or nil when every byte has
    # already been consumed. If the data runs out part way through a chunk,
    # the bits that were available are returned in the high-order positions
    # of the chunk.
    def read
      bits_left_in_chunk = @bits_in_chunk
      chunk = nil
      while bits_left_in_chunk > 0 && @current_pos < @data.size
        chunk = 0 if chunk.nil?
        codepoint = @data[@current_pos, 1].unpack("C*")[0]
        current_byte = codepoint & (2**@bits_left_in_byte - 1) # clear consumed bits
        dif = bits_left_in_chunk - @bits_left_in_byte
        if dif > 0
          # the chunk continues into the next byte: leave room for those bits
          current_byte <<= dif
        elsif dif < 0
          # the chunk ends inside this byte: drop the bits that follow it
          current_byte >>= dif.abs
        end
        chunk |= current_byte # add bits to result
        bits_left_in_chunk = dif >= 0 ? dif : 0
        @bits_left_in_byte = dif < 0 ? dif.abs : 0
        if @bits_left_in_byte.zero? # move on to the next byte
          @current_pos += 1
          @bits_left_in_byte = 8
        end
      end
      chunk
    end
  end

  CODE_EOD = 257 # end of data
  CODE_CLEAR_TABLE = 256 # clear table

  # Stores the code => string pairs built up while decoding. Codes up to
  # 257 are never stored; new entries are numbered from 258.
  class StringTable < Hash # :nodoc:
    attr_reader :string_table_pos

    def initialize
      super
      @string_table_pos = 258 # first code available for a new entry
    end

    # Codes above 257 are looked up in the table (nil when absent). Any
    # other code is converted with Integer#chr, which yields the single
    # byte string for codes 0-255.
    def [](key)
      if key > 257 then super else key.chr end
    end

    # Store string under the next free code.
    def add(string)
      store(@string_table_pos, string)
      @string_table_pos += 1
    end
  end

  # Decompresses a LZW compressed string.
  #
  # data - the compressed bytes (anything responding to to_s).
  #
  # Returns the decoded String. Decoding stops at the end-of-data code, or
  # when the input runs out before one is seen.
  #
  def self.decode(data)
    stream = BitStream.new data.to_s, 9 # size of codes between 9 and 12 bits
    # Start with an empty table so streams that omit the leading
    # clear-table code can still be decoded.
    string_table = StringTable.new
    old_code = nil
    result = ''.dup
    until (code = stream.read) == CODE_EOD
      break if code.nil? # input exhausted without an end-of-data code
      if code == CODE_CLEAR_TABLE
        # A clear-table code restarts the dictionary, and with it the code
        # width: the codes that follow are 9 bits wide again.
        stream.set_bits_in_chunk(9)
        string_table = StringTable.new
        code = stream.read
        break if code.nil? || code == CODE_EOD
        result << string_table[code]
        old_code = code
      elsif old_code.nil?
        # Very first code of a stream with no leading clear-table code:
        # there is no previous string to extend yet, so just emit it.
        result << string_table[code]
        old_code = code
      else
        string = string_table[code]
        if string
          result << string
          string_table.add create_new_string(string_table, old_code, code)
        else
          # The code is not in the table yet: it must be the entry that is
          # about to be created (previous string + its own first byte).
          new_string = create_new_string(string_table, old_code, old_code)
          result << new_string
          string_table.add new_string
        end
        old_code = code
        # increase the size of the codes when the limit is reached
        case string_table.string_table_pos
        when 511 then stream.set_bits_in_chunk(10)
        when 1023 then stream.set_bits_in_chunk(11)
        when 2047 then stream.set_bits_in_chunk(12)
        end
      end
    end
    result
  end

  # Builds the next table entry: the string for some_code followed by the
  # first byte of the string for other_code.
  #
  # A bare `private` keyword does not apply to methods defined with
  # `def self.`, so this helper has always been publicly callable and is
  # left that way.
  def self.create_new_string(string_table, some_code, other_code)
    string_table[some_code] + string_table[other_code][0].chr
  end

end
122
+ end
123
+ end
@@ -0,0 +1,56 @@
1
+ # coding: utf-8
2
+
3
+ class PDF::Reader
4
+
5
+ # DEPRECATED: this class was deprecated in version 0.11.0 and will
6
+ # eventually be removed
7
+ #
8
class MetadataStrategy < AbstractStrategy # :nodoc:

  # The symbol that identifies this strategy.
  def self.to_sym
    :metadata
  end

  # Fires the metadata related callbacks: :pdf_version, then :metadata,
  # :xml_metadata and :page_count when the matching data is present.
  #
  # Returns false without doing anything unless the :metadata option is set.
  def process
    return false unless options[:metadata]

    # may be useful to some people
    callback(:pdf_version, ohash.pdf_version)

    # ye olde metadata
    callback(:metadata, [decoded_info]) if info?

    # new style xml metadata
    callback(:xml_metadata, [xml_metadata]) if xml_metadata?

    # page count
    return unless pages?
    callback(:page_count, ohash.object(pages[:Count]).to_i)
  end

  private

  # The unfiltered data of the stream referenced by the root's :Metadata
  # entry, tagged as utf-8 where the string supports it, or nil when the
  # root has no :Metadata entry. The result (including nil) is memoised.
  def xml_metadata
    return @xml_metadata if defined?(@xml_metadata)

    @xml_metadata =
      unless root[:Metadata].nil?
        raw = ohash.object(root[:Metadata]).unfiltered_data
        raw.force_encoding("utf-8") if raw.respond_to?(:force_encoding)
        raw
      end
  end

  # true when there is xml metadata, false otherwise
  def xml_metadata?
    !!xml_metadata
  end

  # The info data passed through decode_strings, memoised.
  def decoded_info
    @decoded_info = decode_strings(info) unless @decoded_info
    @decoded_info
  end

end
56
+ end
@@ -0,0 +1,85 @@
1
+ # coding: utf-8
2
+
3
+ class PDF::Reader
4
+
5
+ # A Hash-like object for caching commonly used objects from a PDF file.
6
+ #
7
+ # This is an internal class used by PDF::Reader::ObjectHash
8
+ #
9
class ObjectCache # nodoc

  # These object types use little memory and are accessed a heap of times as
  # part of random page access, so we'll cache the unmarshalled objects and
  # avoid lots of repetitive (and expensive) tokenising
  CACHEABLE_TYPES = [:Catalog, :Page, :Pages]

  def initialize
    @objects = {}
  end

  # The cached object stored under key, or nil.
  def [](key)
    @objects[key]
  end

  # Stores value under key, but only when it is a cacheable object; any
  # other value is silently ignored.
  def []=(key, value)
    return unless cacheable?(value)
    @objects[key] = value
  end

  # Like #[], but returns local_default when nothing is stored under key.
  def fetch(key, local_default = nil)
    @objects.fetch(key, local_default)
  end

  # Iterate over every cached key/value pair.
  def each(&block)
    @objects.each(&block)
  end
  alias_method :each_pair, :each

  # Iterate over every cached key.
  def each_key(&block)
    @objects.each_key(&block)
  end

  # Iterate over every cached value.
  def each_value(&block)
    @objects.each_value(&block)
  end

  # Number of cached objects.
  def size
    @objects.size
  end
  alias_method :length, :size

  # true when nothing has been cached.
  def empty?
    @objects.empty?
  end

  # true when something is stored under key.
  def has_key?(key)
    @objects.has_key?(key)
  end
  alias_method :include?, :has_key?
  alias_method :key?, :has_key?
  alias_method :member?, :has_key?

  # true when value is one of the cached objects.
  def has_value?(value)
    @objects.has_value?(value)
  end

  def to_s
    "<PDF::Reader::ObjectCache size: #{size}>"
  end

  # All cached keys, as an array.
  def keys
    @objects.keys
  end

  # All cached objects, as an array.
  def values
    @objects.values
  end

  private

  # Only hashes whose :Type entry is listed in CACHEABLE_TYPES are cached.
  def cacheable?(obj)
    return false unless obj.is_a?(Hash)
    CACHEABLE_TYPES.include?(obj[:Type])
  end

end
85
+ end
@@ -0,0 +1,289 @@
1
+ # coding: utf-8
2
+
3
+ class PDF::Reader
4
+ # Provides low level access to the objects in a PDF file via a hash-like
5
+ # object.
6
+ #
7
+ # A PDF file can be viewed as a large hash map. It is a series of objects
8
+ # stored at precise byte offsets, and a table that maps object IDs to byte
9
+ # offsets. Given an object ID, looking up an object is an O(1) operation.
10
+ #
11
+ # Each PDF object can be mapped to a ruby object, so by passing an object
12
+ # ID to the [] method, a ruby representation of that object will be
13
+ # retrieved.
14
+ #
15
+ # The class behaves much like a standard Ruby hash, including the use of
16
+ # the Enumerable mixin. The key difference is no []= method - the hash
17
+ # is read only.
18
+ #
19
+ # == Basic Usage
20
+ #
21
+ # h = PDF::Reader::ObjectHash.new("somefile.pdf")
22
+ # h[1]
23
+ # => 3469
24
+ #
25
+ # h[PDF::Reader::Reference.new(1,0)]
26
+ # => 3469
27
+ #
28
class ObjectHash
  include Enumerable

  CACHEABLE_TYPES = [:Catalog, :Page, :Pages]

  attr_accessor :default
  attr_reader :trailer, :pdf_version

  # Creates a new ObjectHash object. input can be an IO-like object (anything
  # that responds to seek and read) or the name of an existing file, which
  # is read into memory in full.
  #
  # Raises ArgumentError for any other input, and
  # PDF::Reader::UnsupportedFeatureError when the trailer has an :Encrypt
  # entry.
  #
  def initialize(input)
    if input.respond_to?(:seek) && input.respond_to?(:read)
      @io = input
    elsif File.file?(input.to_s)
      if File.respond_to?(:binread)
        input = File.binread(input.to_s)
      else
        input = File.read(input.to_s)
      end
      @io = StringIO.new(input)
    else
      raise ArgumentError, "input must be an IO-like object or a filename"
    end
    @pdf_version = read_version
    @xref = PDF::Reader::XRef.new(@io)
    @trailer = @xref.trailer
    @cache = PDF::Reader::ObjectCache.new

    if trailer[:Encrypt]
      raise ::PDF::Reader::UnsupportedFeatureError, 'PDF::Reader cannot read encrypted PDF files'
    end
  end

  # returns the type of object a ref points to, as the class name of the
  # loaded object converted to a symbol. Returns nil if the lookup raises.
  def obj_type(ref)
    self[ref].class.to_s.to_sym
  rescue
    nil
  end

  # returns true if the supplied references points to an object with a stream.
  # Returns false if the lookup raises.
  def stream?(ref)
    self[ref].class == PDF::Reader::Stream
  rescue
    false
  end

  # Access an object from the PDF. key can be an int or a PDF::Reader::Reference
  # object.
  #
  # If an int is used, the object with that ID and a generation number of 0 will
  # be returned.
  #
  # If a PDF::Reader::Reference object is used the exact ID and generation number
  # can be specified.
  #
  # Returns the default value when key.to_i is not positive or when an
  # InvalidObjectError is raised during the lookup, and nil when the xref
  # entry is neither an Integer nor a Reference.
  #
  def [](key)
    return default if key.to_i <= 0
    begin
      unless key.kind_of?(PDF::Reader::Reference)
        key = PDF::Reader::Reference.new(key.to_i, 0)
      end
      if @cache.has_key?(key)
        @cache[key]
      else
        location = xref[key]
        # Integer rather than Fixnum: Fixnum was removed in Ruby 3.2 and
        # every Fixnum is an Integer on older versions.
        if location.is_a?(Integer)
          # an Integer entry is used as the position to seek to in the file
          buf = new_buffer(location)
          @cache[key] = Parser.new(buf, self).object(key.id, key.gen)
        elsif location.is_a?(PDF::Reader::Reference)
          # a Reference entry names the object stream that holds the object
          object_streams[location] ||= PDF::Reader::ObjectStream.new(object(location))
          @cache[key] = object_streams[location][key.id]
        end
      end
    rescue InvalidObjectError
      return default
    end
  end

  # returns true if obj is a Hash whose :Type is one of CACHEABLE_TYPES
  def cacheable?(obj)
    obj.is_a?(Hash) && CACHEABLE_TYPES.include?(obj[:Type])
  end

  # If key is a PDF::Reader::Reference object, lookup the corresponding
  # object in the PDF and return it. Otherwise return key untouched.
  #
  def object(key)
    key.is_a?(PDF::Reader::Reference) ? self[key] : key
  end
  alias :deref :object

  # Access an object from the PDF. key can be an int or a PDF::Reader::Reference
  # object.
  #
  # If an int is used, the object with that ID and a generation number of 0 will
  # be returned.
  #
  # If a PDF::Reader::Reference object is used the exact ID and generation number
  # can be specified.
  #
  # local_default is the object that will be returned if the requested key doesn't
  # exist.
  #
  # When nothing is found and no (truthy) local_default was given, IndexError
  # is raised only if key.to_i is not positive; for any other key nil is
  # returned.
  #
  def fetch(key, local_default = nil)
    obj = self[key]
    return obj if obj
    return local_default if local_default

    raise IndexError, "#{key} is invalid" if key.to_i <= 0
  end

  # iterate over each key, value. Just like a ruby hash.
  #
  def each(&block)
    @xref.each do |ref|
      yield ref, self[ref]
    end
  end
  alias :each_pair :each

  # iterate over each key. Just like a ruby hash.
  #
  # Walks the xref table directly so that listing the keys does not require
  # loading every object.
  #
  def each_key(&block)
    @xref.each do |ref|
      yield ref
    end
  end

  # iterate over each value. Just like a ruby hash.
  #
  def each_value(&block)
    each do |id, obj|
      yield obj
    end
  end

  # return the number of entries reported by the xref table
  def size
    xref.size
  end
  alias :length :size

  # return true if there are no objects in this file
  #
  def empty?
    size == 0
  end

  # return true if the specified key exists in the file. key
  # can be an int or a PDF::Reader::Reference. An int matches any
  # reference with that ID, whatever its generation number.
  #
  def has_key?(check_key)
    # TODO update from O(n) to O(1)
    each_key do |key|
      if check_key.kind_of?(PDF::Reader::Reference)
        return true if check_key == key
      else
        return true if check_key.to_i == key.id
      end
    end
    return false
  end
  alias :include? :has_key?
  alias :key? :has_key?
  alias :member? :has_key?

  # return true if the specified value exists in the file
  #
  def has_value?(value)
    # TODO update from O(n) to O(1)
    each_value do |obj|
      return true if obj == value
    end
    return false
  end
  # value? mirrors Hash#value?, so it must test values rather than keys
  alias :value? :has_value?

  def to_s
    "<PDF::Reader::ObjectHash size: #{self.size}>"
  end

  # return an array of all keys in the file
  #
  def keys
    ret = []
    each_key { |k| ret << k }
    ret
  end

  # return an array of all values in the file
  #
  def values
    ret = []
    each_value { |v| ret << v }
    ret
  end

  # return an array of all values from the specified keys
  #
  def values_at(*ids)
    ids.map { |id| self[id] }
  end

  # return an array of arrays. Each sub array contains a key/value pair.
  #
  def to_a
    ret = []
    each do |id, obj|
      ret << [id, obj]
    end
    ret
  end

  # returns an array of PDF::Reader::References. Each reference in the
  # array points a Page object, one for each page in the PDF. The first
  # reference is page 1, second reference is page 2, etc.
  #
  # Useful for apps that want to extract data from specific pages.
  #
  # The list is built on the first call and memoised.
  #
  def page_references
    @page_references ||= begin
      root = fetch(trailer[:Root])
      get_page_objects(root[:Pages]).flatten
    end
  end

  private

  # a fresh buffer over the underlying IO, positioned at offset
  def new_buffer(offset = 0)
    PDF::Reader::Buffer.new(@io, :seek => offset)
  end

  def xref
    @xref
  end

  # object streams that have already been loaded, keyed by their reference
  def object_streams
    @object_stream ||= {}
  end

  # returns a nested array of object references for all pages in this object store.
  #
  # A :Page object yields its own reference, a :Pages object yields the
  # results for each of its :Kids, and anything else yields nil.
  #
  def get_page_objects(ref)
    obj = fetch(ref)

    if obj[:Type] == :Page
      ref
    elsif obj[:Type] == :Pages
      obj[:Kids].map { |kid| get_page_objects(kid) }
    end
  end

  # reads the version number out of the first 10 bytes of the file and
  # rewinds the IO. Returns it as a float (0.0 when no version is found).
  def read_version
    @io.seek(0)
    m, version = *@io.read(10).match(/PDF-(\d.\d)/)
    @io.seek(0)
    version.to_f
  end

end
289
+ end
@@ -0,0 +1,51 @@
1
+ # coding: utf-8
2
+
3
+ class PDF::Reader
4
+
5
+ # provides a wrapper around a PDF stream object that contains other objects in it.
6
+ # This is done for added compression and is described as an "Object Stream" in the spec.
7
+ #
8
class ObjectStream # :nodoc:

  # stream - the stream object to wrap. Its hash is kept as the stream
  #          dictionary and its unfiltered_data as the raw content.
  def initialize(stream)
    @dict = stream.hash
    @data = stream.unfiltered_data
  end

  # Parses and returns the object stored under objid, or nil when this
  # stream has no offset recorded for that id.
  def [](objid)
    offset = offsets[objid]
    return nil if offset.nil?

    buf = PDF::Reader::Buffer.new(StringIO.new(@data), :seek => offset)
    PDF::Reader::Parser.new(buf).parse_token
  end

  # The value of the :N entry in the stream dictionary.
  def size
    @dict[:N]
  end

  private

  # Maps object ids to positions within the data. The table is read from
  # the start of the data as `size` pairs of tokens (id, then an offset
  # that is added to the :First entry), and is reused once it has entries.
  def offsets
    @offsets ||= {}
    return @offsets unless @offsets.keys.size == 0

    size.times do
      objid = buffer.token.to_i
      @offsets[objid] = first + buffer.token.to_i
    end
    @offsets
  end

  # The value of the :First entry in the stream dictionary.
  def first
    @dict[:First]
  end

  # A single buffer over the data, shared by every read of the offset table.
  def buffer
    @buffer ||= PDF::Reader::Buffer.new(StringIO.new(@data))
  end

end
49
+
50
+ end
51
+