fireinc-pdf-reader 0.11.0.alpha

Sign up to get free protection for your applications and to get access to all the features.
Files changed (54) hide show
  1. data/CHANGELOG +168 -0
  2. data/MIT-LICENSE +21 -0
  3. data/README.rdoc +137 -0
  4. data/Rakefile +34 -0
  5. data/TODO +45 -0
  6. data/bin/pdf_list_callbacks +15 -0
  7. data/bin/pdf_object +48 -0
  8. data/bin/pdf_text +15 -0
  9. data/examples/callbacks.rb +21 -0
  10. data/examples/extract_bates.rb +49 -0
  11. data/examples/extract_images.rb +108 -0
  12. data/examples/hash.rb +12 -0
  13. data/examples/metadata.rb +25 -0
  14. data/examples/page_counter_improved.rb +23 -0
  15. data/examples/page_counter_naive.rb +24 -0
  16. data/examples/rspec.rb +57 -0
  17. data/examples/text.rb +40 -0
  18. data/examples/version.rb +25 -0
  19. data/lib/pdf/hash.rb +15 -0
  20. data/lib/pdf/reader/abstract_strategy.rb +81 -0
  21. data/lib/pdf/reader/buffer.rb +346 -0
  22. data/lib/pdf/reader/cmap.rb +138 -0
  23. data/lib/pdf/reader/encoding.rb +190 -0
  24. data/lib/pdf/reader/encodings/mac_expert.txt +159 -0
  25. data/lib/pdf/reader/encodings/mac_roman.txt +128 -0
  26. data/lib/pdf/reader/encodings/pdf_doc.txt +40 -0
  27. data/lib/pdf/reader/encodings/standard.txt +47 -0
  28. data/lib/pdf/reader/encodings/symbol.txt +154 -0
  29. data/lib/pdf/reader/encodings/win_ansi.txt +29 -0
  30. data/lib/pdf/reader/encodings/zapf_dingbats.txt +201 -0
  31. data/lib/pdf/reader/error.rb +53 -0
  32. data/lib/pdf/reader/filter.rb +219 -0
  33. data/lib/pdf/reader/font.rb +133 -0
  34. data/lib/pdf/reader/form_xobject.rb +83 -0
  35. data/lib/pdf/reader/glyphlist.txt +4322 -0
  36. data/lib/pdf/reader/lzw.rb +123 -0
  37. data/lib/pdf/reader/metadata_strategy.rb +56 -0
  38. data/lib/pdf/reader/object_cache.rb +85 -0
  39. data/lib/pdf/reader/object_hash.rb +289 -0
  40. data/lib/pdf/reader/object_stream.rb +51 -0
  41. data/lib/pdf/reader/page.rb +185 -0
  42. data/lib/pdf/reader/page_text_receiver.rb +278 -0
  43. data/lib/pdf/reader/pages_strategy.rb +475 -0
  44. data/lib/pdf/reader/parser.rb +225 -0
  45. data/lib/pdf/reader/print_receiver.rb +18 -0
  46. data/lib/pdf/reader/reference.rb +66 -0
  47. data/lib/pdf/reader/register_receiver.rb +95 -0
  48. data/lib/pdf/reader/stream.rb +69 -0
  49. data/lib/pdf/reader/text_receiver.rb +264 -0
  50. data/lib/pdf/reader/token.rb +41 -0
  51. data/lib/pdf/reader/xref.rb +220 -0
  52. data/lib/pdf/reader.rb +296 -0
  53. data/lib/pdf-reader.rb +1 -0
  54. metadata +211 -0
@@ -0,0 +1,220 @@
1
+ ################################################################################
2
+ #
3
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining
6
+ # a copy of this software and associated documentation files (the
7
+ # "Software"), to deal in the Software without restriction, including
8
+ # without limitation the rights to use, copy, modify, merge, publish,
9
+ # distribute, sublicense, and/or sell copies of the Software, and to
10
+ # permit persons to whom the Software is furnished to do so, subject to
11
+ # the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be
14
+ # included in all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
+ #
24
+ ################################################################################
25
+
26
+ class PDF::Reader
27
+ ################################################################################
28
+ # An internal PDF::Reader class that represents the XRef table in a PDF file as a
29
+ # hash-like object.
30
+ #
31
+ # An Xref table is a map of object identifiers and byte offsets. Any time a particular
32
+ # object needs to be found, the Xref table is used to find where it is stored in the
33
+ # file.
34
+ #
35
+ # Hash keys are object ids, values are either:
36
+ #
37
+ # * a byte offset where the object starts (regular PDF objects)
38
+ # * a PDF::Reader::Reference instance that points to a stream that contains the
39
+ # desired object (PDF objects embedded in an object stream)
40
+ #
41
+ # The class behaves much like a standard Ruby hash, including the use of
42
+ # the Enumerable mixin. The key difference is no []= method - the hash
43
+ # is read only.
44
+ #
45
+ class XRef
46
+ include Enumerable
47
+ attr_reader :trailer
48
+
49
+ ################################################################################
50
# Builds the cross-reference table for the PDF held in the supplied io object.
#
# io - must be an IO object, generally either a file or a StringIO
#
# The table is populated eagerly: load_offsets walks the file and its return
# value is kept as the trailer.
def initialize(io)
  @io, @xref = io, {}
  @trailer = load_offsets
end
59
+ ################################################################################
60
# Number of distinct object ids held in the table. An object that is present
# in several generations is only counted once.
def size
  @xref.length
end
65
+ ################################################################################
66
# Looks up the stored location of a single PDF object.
#
# ref - a PDF::Reader::Reference object (anything responding to #id and #gen)
#       containing an object ID and revision number
#
# Returns whatever was stored for that id and generation. Returns nil when
# the id is known but the requested generation is not.
#
# Raises InvalidObjectError when the table has no entry for the object id.
def [](ref)
  generations = @xref[ref.id]
  # An explicit lookup replaces the old blanket rescue, which converted any
  # StandardError raised during the lookup into InvalidObjectError.
  unless generations
    raise InvalidObjectError, "Object #{ref.id}, Generation #{ref.gen} is invalid"
  end
  generations[ref.gen]
end
74
+ ################################################################################
75
# Iterates over every object id in the xref table in ascending order,
# yielding a PDF::Reader::Reference built from the id and the highest
# generation stored for it.
#
# Returns an Enumerator when called without a block, so the method can be
# chained like the each of a regular collection.
def each(&block)
  return enum_for(:each) unless block_given?

  @xref.keys.sort.each do |id|
    newest_gen = @xref[id].keys.max
    yield PDF::Reader::Reference.new(id, newest_gen)
  end
end
83
+ ################################################################################
84
+ private
85
+ ################################################################################
86
# Read an xref section from the underlying buffer.
#
# offset - byte position of the section. When omitted, the position reported
#          by the buffer's find_first_xref_offset is used.
#
# After seeking to the offset, processing is handed off to load_xref_table()
# when the first token is "xref" (or "ref"), or to load_xref_stream() when
# the first three tokens look like the header of an indirect object
# ("<id> <gen> obj").
#
# Returns the value of whichever loader ran. Raises
# PDF::Reader::MalformedPDFError when neither form is found.
def load_offsets(offset = nil)
  offset ||= new_buffer.find_first_xref_offset

  buf   = new_buffer(offset)
  first = buf.token

  return load_xref_table(buf) if first == "xref" || first == "ref"

  second = buf.token
  third  = buf.token

  unless first.to_i >= 0 && second.to_i >= 0 && third == "obj"
    raise PDF::Reader::MalformedPDFError, "xref table not found at offset #{offset} (#{first} != xref)"
  end

  # Start again from the same offset so the parser sees the whole object.
  parser = PDF::Reader::Parser.new(new_buffer(offset))
  load_xref_stream(parser.object(first.to_i, second.to_i))
end
113
+ ################################################################################
114
# Assumes the underlying buffer is positioned at the start of a traditional
# Xref table (just after the "xref" keyword) and processes it into memory.
#
# buf - a tokenising buffer responding to #token
#
# Every in-use ("n") entry is recorded via store(); other entries are
# skipped. After the table, the trailer dictionary is parsed and any
# :XRefStm / :Prev sections it points at are loaded as well.
#
# Returns the trailer Hash. Raises MalformedPDFError when the trailer is not
# a dictionary.
def load_xref_table(buf)
  params = []

  while !params.include?("trailer") && !params.include?(nil)
    if params.size == 2
      objid, count = params[0].to_i, params[1].to_i
      count.times do
        offset     = buf.token.to_i
        generation = buf.token.to_i
        state      = buf.token

        store(objid, generation, offset) if state == "n"
        objid += 1
      end
      # Reset after every subsection header, including one that declares
      # zero entries; otherwise the stale header would stop any later
      # subsection from being recognised.
      params.clear
    end
    params << buf.token
  end

  trailer = Parser.new(buf, self).parse_token

  raise MalformedPDFError, "PDF malformed, trailer should be a dictionary" unless trailer.kind_of?(Hash)

  load_offsets(trailer[:XRefStm]) if trailer.has_key?(:XRefStm)
  load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev)

  trailer
end
144
+
145
+ ################################################################################
146
# Read an XRef stream from the underlying buffer instead of a traditional
# xref table, recording its entries via store().
#
# stream - an object responding to #hash (the stream dictionary) and
#          #unfiltered_data (the decoded stream bytes)
#
# Each entry is made of three fields whose byte widths come from :W and
# which are decoded with unpack_bytes. Type 1 entries hold a byte offset and
# a generation; type 2 entries hold the number of the object stream that
# contains the object. Entries of any other type, or with a zero second
# field, are ignored.
#
# Returns a trailer-like Hash holding the :Size, :Prev, :Root, :Encrypt,
# :Info and :ID entries of the stream dictionary. Raises
# PDF::Reader::MalformedPDFError when the stream is not of type :XRef.
def load_xref_stream(stream)
  unless stream.hash[:Type] == :XRef
    raise PDF::Reader::MalformedPDFError, "xref stream not found when expected"
  end
  trailer = Hash[stream.hash.select { |key, value|
    [:Size, :Prev, :Root, :Encrypt, :Info, :ID].include?(key)
  }]

  widths       = stream.hash[:W]
  entry_length = widths.inject(0) { |s, w| s + w }
  raw_data     = StringIO.new(stream.unfiltered_data)
  # Without an explicit :Index the stream covers objects 0...Size.
  index        = stream.hash[:Index] || [0, stream.hash[:Size]]

  index.each_slice(2) do |start_id, size|
    (start_id...(start_id + size)).each do |objid|
      entry = raw_data.read(entry_length) || ""
      # A zero-width type field means the field is absent from the stream
      # and every entry is type 1 (PDF spec, description of /W).
      f1 = widths[0] == 0 ? 1 : unpack_bytes(entry[0, widths[0]])
      f2 = unpack_bytes(entry[widths[0], widths[1]])
      f3 = unpack_bytes(entry[widths[0] + widths[1], widths[2]])
      if f1 == 1 && f2 > 0
        store(objid, f3, f2)
      elsif f1 == 2 && f2 > 0
        store(objid, 0, PDF::Reader::Reference.new(f2, 0))
      end
    end
  end

  load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev)

  trailer
end
183
+ ################################################################################
184
# XRef streams pack info into unsigned integers 1-N bytes wide, most
# significant byte first. Converts one such field into an Integer.
#
# bytes - a String of raw bytes, or nil
#
# Returns 0 for nil or an empty string. Fields of any width are accepted, so
# streams that use more than four bytes per field can be read too.
def unpack_bytes(bytes)
  # Fold the bytes together from most to least significant.
  bytes.to_s.each_byte.inject(0) { |total, byte| (total << 8) | byte }
end
202
+ ################################################################################
203
# Wrap the io stream we're working with in a fresh PDF::Reader::Buffer that
# can tokenise it for us.
#
# A new buffer is built on every call so that multiple sections of the file
# can be tokenised at the same time without worrying about clearing a
# buffer's contents.
#
# offset - handed to the buffer as its :seek option (defaults to 0)
def new_buffer(offset = 0)
  buffer_class = PDF::Reader::Buffer
  buffer_class.new(@io, :seek => offset)
end
211
+ ################################################################################
212
# Stores an offset value for a particular PDF object ID and revision number.
#
# The first (truthy) value stored for an id/generation pair is kept; later
# calls for the same pair leave it untouched. Returns the value now held.
def store(id, gen, offset)
  generations = (@xref[id] ||= {})
  generations[gen] ||= offset
end
217
+ end
218
+ ################################################################################
219
+ end
220
+ ################################################################################
data/lib/pdf/reader.rb ADDED
@@ -0,0 +1,296 @@
1
+ ################################################################################
2
+ #
3
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
4
+ # Copyright (C) 2011 James Healy
5
+ #
6
+ # Permission is hereby granted, free of charge, to any person obtaining
7
+ # a copy of this software and associated documentation files (the
8
+ # "Software"), to deal in the Software without restriction, including
9
+ # without limitation the rights to use, copy, modify, merge, publish,
10
+ # distribute, sublicense, and/or sell copies of the Software, and to
11
+ # permit persons to whom the Software is furnished to do so, subject to
12
+ # the following conditions:
13
+ #
14
+ # The above copyright notice and this permission notice shall be
15
+ # included in all copies or substantial portions of the Software.
16
+ #
17
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
21
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
22
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
23
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24
+ #
25
+ ################################################################################
26
+
27
+ require 'stringio'
28
+ require 'zlib'
29
+
30
+ require 'ascii85'
31
+
32
+ module PDF
33
+ ################################################################################
34
+ # The Reader class serves as an entry point for parsing a PDF file.
35
+ #
36
+ # PDF is a page based file format. There is some data associated with the
37
+ # document (metadata, bookmarks, etc) but all visible content is stored
38
+ # under a Page object.
39
+ #
40
+ # In most use cases for extracting and examining the contents of a PDF it
41
+ # makes sense to traverse the information using page based iteration.
42
+ #
43
+ # In addition to the documentation here, check out the
44
+ # PDF::Reader::Page class.
45
+ #
46
+ # == File Metadata
47
+ #
48
+ # reader = PDF::Reader.new("somefile.pdf")
49
+ #
50
+ # puts reader.pdf_version
51
+ # puts reader.info
52
+ # puts reader.metadata
53
+ # puts reader.page_count
54
+ #
55
+ # == Iterating over page content
56
+ #
57
+ # reader = PDF::Reader.new("somefile.pdf")
58
+ #
59
+ # reader.pages.each do |page|
60
+ # puts page.fonts
61
+ # puts page.images
62
+ # puts page.text
63
+ # end
64
+ #
65
+ # == Extracting all text
66
+ #
67
+ # reader = PDF::Reader.new("somefile.pdf")
68
+ #
69
+ # reader.pages.map(&:text)
70
+ #
71
+ # == Extracting content from a single page
72
+ #
73
+ # reader = PDF::Reader.new("somefile.pdf")
74
+ #
75
+ # page = reader.page(1)
76
+ # puts page.fonts
77
+ # puts page.images
78
+ # puts page.text
79
+ #
80
+ # == Low level callbacks (ala current version of PDF::Reader)
81
+ #
82
+ # reader = PDF::Reader.new("somefile.pdf")
83
+ #
84
+ # page = reader.page(1)
85
+ # page.walk(receiver)
86
+ #
87
+ class Reader
88
+
89
+ # lowlevel hash-like access to all objects in the underlying PDF
90
+ attr_reader :objects
91
+
92
+ attr_reader :page_count, :pdf_version, :info, :metadata
93
+
94
# creates a new document reader for the provided PDF.
#
# input can be an IO-ish object (StringIO, File, etc) containing a PDF
# or a filename
#
#   reader = PDF::Reader.new("somefile.pdf")
#
#   File.open("somefile.pdf","rb") do |file|
#     reader = PDF::Reader.new(file)
#   end
#
def initialize(input = nil)
  # With no input nothing is loaded; this supports the deprecated Reader
  # API, whose class-level helpers call new without arguments.
  return unless input

  @objects     = PDF::Reader::ObjectHash.new(input)
  @page_count  = get_page_count
  @pdf_version = @objects.pdf_version
  @info        = @objects.deref(@objects.trailer[:Info])
  @metadata    = get_metadata
end
114
+
115
# syntactic sugar for opening a PDF file. Accepts the same arguments
# as new() and yields the resulting reader to the block.
#
#   PDF::Reader.open("somefile.pdf") do |reader|
#     puts reader.pdf_version
#   end
#
# Returns the value of the block.
def self.open(input, &block)
  reader = PDF::Reader.new(input)
  yield reader
end
125
+
126
# DEPRECATED: this method was deprecated in version 0.11.0 and will
# eventually be removed
#
# Parse the file with the given name, sending events to the given receiver.
# The file is opened in binary mode and closed again once parsing finishes.
#
def self.file(name, receivers, opts = {})
  File.open(name, "rb") { |handle| new.parse(handle, receivers, opts) }
end
137
+
138
# DEPRECATED: this method was deprecated in version 0.11.0 and will
# eventually be removed
#
# Parse the given string (wrapped in a StringIO), sending events to the
# given receiver.
#
def self.string(str, receivers, opts = {})
  StringIO.open(str) { |buffer| new.parse(buffer, receivers, opts) }
end
148
+
149
# DEPRECATED: this method was deprecated in version 0.11.0 and will
# eventually be removed
#
# Parse the file with the given name, returning an unmarshalled ruby object
# that represents the requested pdf object. id and gen are converted with
# to_i before use.
#
def self.object_file(name, id, gen = 0)
  File.open(name, "rb") do |handle|
    new.object(handle, id.to_i, gen.to_i)
  end
end
160
+
161
# DEPRECATED: this method was deprecated in version 0.11.0 and will
# eventually be removed
#
# Parse the given string, returning an unmarshalled ruby object that
# represents the requested pdf object. id and gen are converted with to_i
# before use.
#
def self.object_string(str, id, gen = 0)
  StringIO.open(str) do |buffer|
    new.object(buffer, id.to_i, gen.to_i)
  end
end
172
+
173
# returns an array of PDF::Reader::Page objects, one for each
# page in the source PDF. Pages are numbered from 1 up to the page count.
#
#   reader = PDF::Reader.new("somefile.pdf")
#
#   reader.pages.each do |page|
#     puts page.fonts
#     puts page.images
#     puts page.text
#   end
#
# See the docs for PDF::Reader::Page to read more about the
# methods available on each page
#
def pages
  page_numbers = 1..@page_count
  page_numbers.map do |number|
    PDF::Reader::Page.new(@objects, number)
  end
end
192
+
193
# returns a single PDF::Reader::Page for the specified page.
# Use this instead of pages method when you need to access just a single
# page
#
#   reader = PDF::Reader.new("somefile.pdf")
#   page   = reader.page(10)
#
#   puts page.text
#
# num is converted with to_i; an ArgumentError is raised when the result
# falls outside 1 .. page_count.
#
# See the docs for PDF::Reader::Page to read more about the
# methods available on each page
#
def page(num)
  number = num.to_i
  unless number >= 1 && number <= @page_count
    raise ArgumentError, "valid pages are 1 .. #{@page_count}"
  end
  PDF::Reader::Page.new(@objects, number)
end
210
+
211
+
212
# DEPRECATED: this method was deprecated in version 0.11.0 and will
# eventually be removed
#
# Given an IO object that contains PDF data, parse it.
#
# receivers and the merged options are handed to each strategy in turn.
# opts overrides the defaults (:pages => true, :raw_text => false,
# :metadata => true).
#
# Returns self. Raises PDF::Reader::UnsupportedFeatureError when the
# trailer has an :Encrypt entry.
#
def parse(io, receivers, opts = {})
  ohash = ObjectHash.new(io)

  raise ::PDF::Reader::UnsupportedFeatureError, 'PDF::Reader cannot read encrypted PDF files' if ohash.trailer[:Encrypt]

  defaults = {:pages => true, :raw_text => false, :metadata => true}
  options  = defaults.merge(opts)

  strategies.each { |strategy| strategy.new(ohash, receivers, options).process }

  self
end
233
+
234
# DEPRECATED: this method was deprecated in version 0.11.0 and will
# eventually be removed
#
# Given an IO object that contains PDF data, return the contents of a single
# object. The ObjectHash built for the io is kept in @objects.
#
def object(io, id, gen)
  @objects = ObjectHash.new(io)
  reference = Reference.new(id, gen)
  @objects.deref(reference)
end
244
+
245
+ private
246
+
247
# The strategy classes that parse() runs, in order. The list is built once
# and cached in @strategies.
def strategies
  return @strategies if @strategies

  @strategies = [
    ::PDF::Reader::MetadataStrategy,
    ::PDF::Reader::PagesStrategy
  ]
end
253
+
254
# The dereferenced :Root entry of the trailer.
#
# Memoised in @root. The previous `root ||= ...` only assigned a throwaway
# local variable, so every call dereferenced the entry again.
def root
  @root ||= @objects.deref(@objects.trailer[:Root])
end
257
+
258
# Dereferences the :Metadata entry of the root object and returns the
# stream's unfiltered data, or nil when there is no such stream.
def get_metadata
  stream = @objects.deref(root[:Metadata])
  return nil unless stream

  stream.unfiltered_data
end
262
+
263
# Dereferences the :Pages entry of the root object and returns its :Count
# value.
def get_page_count
  @objects.deref(root[:Pages])[:Count]
end
267
+
268
+ end
269
+ end
270
+ ################################################################################
271
+
272
+ require 'pdf/reader/abstract_strategy'
273
+ require 'pdf/reader/buffer'
274
+ require 'pdf/reader/cmap'
275
+ require 'pdf/reader/encoding'
276
+ require 'pdf/reader/error'
277
+ require 'pdf/reader/filter'
278
+ require 'pdf/reader/font'
279
+ require 'pdf/reader/form_xobject'
280
+ require 'pdf/reader/lzw'
281
+ require 'pdf/reader/metadata_strategy'
282
+ require 'pdf/reader/object_cache'
283
+ require 'pdf/reader/object_hash'
284
+ require 'pdf/reader/object_stream'
285
+ require 'pdf/reader/pages_strategy'
286
+ require 'pdf/reader/parser'
287
+ require 'pdf/reader/print_receiver'
288
+ require 'pdf/reader/reference'
289
+ require 'pdf/reader/register_receiver'
290
+ require 'pdf/reader/stream'
291
+ require 'pdf/reader/text_receiver'
292
+ require 'pdf/reader/page_text_receiver'
293
+ require 'pdf/reader/token'
294
+ require 'pdf/reader/xref'
295
+ require 'pdf/reader/page'
296
+ require 'pdf/hash'
data/lib/pdf-reader.rb ADDED
@@ -0,0 +1 @@
1
+ require "pdf/reader"