fireinc-pdf-reader 0.11.0.alpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. data/CHANGELOG +168 -0
  2. data/MIT-LICENSE +21 -0
  3. data/README.rdoc +137 -0
  4. data/Rakefile +34 -0
  5. data/TODO +45 -0
  6. data/bin/pdf_list_callbacks +15 -0
  7. data/bin/pdf_object +48 -0
  8. data/bin/pdf_text +15 -0
  9. data/examples/callbacks.rb +21 -0
  10. data/examples/extract_bates.rb +49 -0
  11. data/examples/extract_images.rb +108 -0
  12. data/examples/hash.rb +12 -0
  13. data/examples/metadata.rb +25 -0
  14. data/examples/page_counter_improved.rb +23 -0
  15. data/examples/page_counter_naive.rb +24 -0
  16. data/examples/rspec.rb +57 -0
  17. data/examples/text.rb +40 -0
  18. data/examples/version.rb +25 -0
  19. data/lib/pdf/hash.rb +15 -0
  20. data/lib/pdf/reader/abstract_strategy.rb +81 -0
  21. data/lib/pdf/reader/buffer.rb +346 -0
  22. data/lib/pdf/reader/cmap.rb +138 -0
  23. data/lib/pdf/reader/encoding.rb +190 -0
  24. data/lib/pdf/reader/encodings/mac_expert.txt +159 -0
  25. data/lib/pdf/reader/encodings/mac_roman.txt +128 -0
  26. data/lib/pdf/reader/encodings/pdf_doc.txt +40 -0
  27. data/lib/pdf/reader/encodings/standard.txt +47 -0
  28. data/lib/pdf/reader/encodings/symbol.txt +154 -0
  29. data/lib/pdf/reader/encodings/win_ansi.txt +29 -0
  30. data/lib/pdf/reader/encodings/zapf_dingbats.txt +201 -0
  31. data/lib/pdf/reader/error.rb +53 -0
  32. data/lib/pdf/reader/filter.rb +219 -0
  33. data/lib/pdf/reader/font.rb +133 -0
  34. data/lib/pdf/reader/form_xobject.rb +83 -0
  35. data/lib/pdf/reader/glyphlist.txt +4322 -0
  36. data/lib/pdf/reader/lzw.rb +123 -0
  37. data/lib/pdf/reader/metadata_strategy.rb +56 -0
  38. data/lib/pdf/reader/object_cache.rb +85 -0
  39. data/lib/pdf/reader/object_hash.rb +289 -0
  40. data/lib/pdf/reader/object_stream.rb +51 -0
  41. data/lib/pdf/reader/page.rb +185 -0
  42. data/lib/pdf/reader/page_text_receiver.rb +278 -0
  43. data/lib/pdf/reader/pages_strategy.rb +475 -0
  44. data/lib/pdf/reader/parser.rb +225 -0
  45. data/lib/pdf/reader/print_receiver.rb +18 -0
  46. data/lib/pdf/reader/reference.rb +66 -0
  47. data/lib/pdf/reader/register_receiver.rb +95 -0
  48. data/lib/pdf/reader/stream.rb +69 -0
  49. data/lib/pdf/reader/text_receiver.rb +264 -0
  50. data/lib/pdf/reader/token.rb +41 -0
  51. data/lib/pdf/reader/xref.rb +220 -0
  52. data/lib/pdf/reader.rb +296 -0
  53. data/lib/pdf-reader.rb +1 -0
  54. metadata +211 -0
@@ -0,0 +1,220 @@
1
+ ################################################################################
2
+ #
3
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining
6
+ # a copy of this software and associated documentation files (the
7
+ # "Software"), to deal in the Software without restriction, including
8
+ # without limitation the rights to use, copy, modify, merge, publish,
9
+ # distribute, sublicense, and/or sell copies of the Software, and to
10
+ # permit persons to whom the Software is furnished to do so, subject to
11
+ # the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be
14
+ # included in all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
+ #
24
+ ################################################################################
25
+
26
+ class PDF::Reader
27
+ ################################################################################
28
+ # An internal PDF::Reader class that represents the XRef table in a PDF file as a
29
+ # hash-like object.
30
+ #
31
+ # An Xref table is a map of object identifiers and byte offsets. Any time a particular
32
+ # object needs to be found, the Xref table is used to find where it is stored in the
33
+ # file.
34
+ #
35
+ # Hash keys are object ids, values are either:
36
+ #
37
+ # * a byte offset where the object starts (regular PDF objects)
38
+ # * a PDF::Reader::Reference instance that points to a stream that contains the
39
+ # desired object (PDF objects embedded in an object stream)
40
+ #
41
+ # The class behaves much like a standard Ruby hash, including the use of
42
+ # the Enumerable mixin. The key difference is no []= method - the hash
43
+ # is read only.
44
+ #
45
+ class XRef
46
+ include Enumerable
47
+ attr_reader :trailer
48
+
49
+ ################################################################################
50
+ # create a new Xref table based on the contents of the supplied io object
51
+ #
52
+ # io - must be an IO object, generally either a file or a StringIO
53
+ #
54
def initialize(io)
  # Both ivars must exist before load_offsets runs: it reads @io through
  # new_buffer and fills @xref through store, then returns the trailer.
  @io, @xref = io, {}
  @trailer = load_offsets
end
59
+ ################################################################################
60
+ # return the number of objects in this file. Objects with multiple generations are
61
+ # only counted once.
62
def size
  # @xref holds one top-level entry per object id, regardless of how many
  # generations are recorded beneath it.
  @xref.length
end
65
+ ################################################################################
66
+ # returns the byte offset for the specified PDF object.
67
+ #
68
+ # ref - a PDF::Reader::Reference object containing an object ID and revision number
69
def [](ref)
  # Returns the value stored for ref's object id and generation.
  #
  # The id is looked up explicitly rather than wrapping the access in a bare
  # rescue, so unrelated errors are no longer reported as an invalid object.
  # Raises InvalidObjectError when the object id is not in the table.
  generations = @xref[ref.id]
  if generations.nil?
    raise InvalidObjectError, "Object #{ref.id}, Generation #{ref.gen} is invalid"
  end

  # A known id with an unrecorded generation still yields nil, as before.
  generations[ref.gen]
end
74
+ ################################################################################
75
+ # iterate over each object in the xref table
76
def each(&block)
  # Walk object ids in ascending order, yielding a Reference to the highest
  # generation recorded for each id.
  @xref.keys.sort.each do |obj_id|
    newest_gen = @xref[obj_id].keys.sort.last
    yield PDF::Reader::Reference.new(obj_id, newest_gen)
  end
end
83
+ ################################################################################
84
+ private
85
+ ################################################################################
86
+ # Read a xref table from the underlying buffer.
87
+ #
88
+ # If offset is specified the table will be loaded from there, otherwise the
89
+ # default offset will be located and used.
90
+ #
91
+ # After seeking to the offset, processing is handed off to either load_xref_table()
92
+ # or load_xref_stream() based on what we find there.
93
+ #
94
def load_offsets(offset = nil)
  # With no explicit offset, ask a fresh buffer where the first xref section is.
  offset ||= new_buffer.find_first_xref_offset

  buf = new_buffer(offset)
  first = buf.token

  # A traditional table starts with the xref keyword ("ref" is accepted too).
  if first == "xref" || first == "ref"
    return load_xref_table(buf)
  end

  second = buf.token
  third = buf.token

  # Anything else must look like an indirect object header: "<id> <gen> obj".
  looks_like_object = first.to_i >= 0 && second.to_i >= 0 && third == "obj"
  unless looks_like_object
    raise PDF::Reader::MalformedPDFError, "xref table not found at offset #{offset} (#{first} != xref)"
  end

  # Start again from the same offset so the parser reads the object from its
  # first token.
  buf = new_buffer(offset)
  stream = PDF::Reader::Parser.new(buf).object(first.to_i, second.to_i)
  load_xref_stream(stream)
end
113
+ ################################################################################
114
+ # Assumes the underlying buffer is positioned at the start of a traditional
115
+ # Xref table and processes it into memory.
116
def load_xref_table(buf)
  # Reads a traditional xref table from buf: repeated "<first id> <count>"
  # subsection headers, each followed by <count> entries of
  # "<offset> <generation> <state>", up to the trailer keyword.
  #
  # Returns the trailer dictionary. Raises MalformedPDFError when the trailer
  # is not a Hash.
  params = []

  # Stop at the trailer keyword, or on a nil token (presumably end of input).
  until params.include?("trailer") || params.include?(nil)
    if params.size == 2
      objid, count = params[0].to_i, params[1].to_i
      count.times do
        offset = buf.token.to_i
        generation = buf.token.to_i
        state = buf.token

        # Only entries marked "n" are recorded.
        store(objid, generation, offset) if state == "n"
        objid += 1
      end
      # Reset once per subsection, outside the entry loop. Clearing inside the
      # loop meant a subsection with zero entries left its header in params,
      # so every later subsection was silently skipped.
      params.clear
    end
    params << buf.token
  end

  trailer = Parser.new(buf, self).parse_token

  raise MalformedPDFError, "PDF malformed, trailer should be a dictionary" unless trailer.kind_of?(Hash)

  # Also load the sections the trailer points at via :XRefStm and :Prev.
  load_offsets(trailer[:XRefStm]) if trailer.has_key?(:XRefStm)
  load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev)

  trailer
end
144
+
145
+ ################################################################################
146
+ # Read an XRef stream from the underlying buffer instead of a traditional xref table.
147
+ #
148
def load_xref_stream(stream)
  # Loads xref entries from a cross-reference stream object.
  #
  # stream - must respond to hash (the stream dictionary) and unfiltered_data
  #
  # Returns the trailer-style subset of the stream dictionary. Raises
  # MalformedPDFError when the dictionary's :Type is not :XRef.
  unless stream.hash[:Type] == :XRef
    raise PDF::Reader::MalformedPDFError, "xref stream not found when expected"
  end
  trailer = Hash[stream.hash.select { |key, value|
    [:Size, :Prev, :Root, :Encrypt, :Info, :ID].include?(key)
  }]

  widths = stream.hash[:W]
  entry_length = widths.inject(0) { |s, w| s + w }
  raw_data = StringIO.new(stream.unfiltered_data)

  # Without :Index, a single run covering ids 0...Size is used.
  index = stream.hash[:Index] || [0, stream.hash[:Size]]

  index.each_slice(2) do |start_id, size|
    (start_id...(start_id + size)).each do |objid|
      # A short read yields nil; treat it as an empty entry.
      entry = raw_data.read(entry_length) || ""

      # Per the PDF spec, a zero-width type field means the field is absent
      # and the entry type defaults to 1. Decoding it as 0 would discard
      # every entry in such a stream.
      f1 = widths[0] == 0 ? 1 : unpack_bytes(entry[0, widths[0]])
      f2 = unpack_bytes(entry[widths[0], widths[1]])
      f3 = unpack_bytes(entry[widths[0] + widths[1], widths[2]])

      if f1 == 1 && f2 > 0
        # type 1: stored with f3 as the generation and f2 as the offset
        store(objid, f3, f2)
      elsif f1 == 2 && f2 > 0
        # type 2: stored as a Reference built from f2
        store(objid, 0, PDF::Reader::Reference.new(f2, 0))
      end
    end
  end

  load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev)

  trailer
end
183
+ ################################################################################
184
+ # XRef streams pack info into integers 1-N bytes wide. Depending on the number of
185
+ # bytes they need to be converted to an int in different ways.
186
+ #
187
def unpack_bytes(bytes)
  # Decodes bytes as a big-endian unsigned integer of any width.
  #
  # nil or an empty string decodes to 0. Folding one byte at a time gives the
  # same results as the previous per-width unpack calls for 1-4 bytes, and
  # also handles wider fields, which used to raise UnsupportedFeatureError.
  bytes.to_s.unpack("C*").inject(0) { |value, byte| (value << 8) | byte }
end
202
+ ################################################################################
203
+ # Wrap the io stream we're working with in a buffer that can tokenise it for us.
204
+ #
205
+ # We create multiple buffers so we can be tokenising multiple sections of the file
206
+ # at the same time without worrying about clearing the buffer's contents.
207
+ #
208
def new_buffer(offset = 0)
  # Each call builds a separate Buffer over the shared @io, passing offset as
  # the :seek option.
  seek_to = offset
  PDF::Reader::Buffer.new(@io, :seek => seek_to)
end
211
+ ################################################################################
212
+ # Stores an offset value for a particular PDF object ID and revision number
213
+ #
214
def store(id, gen, offset)
  # The first value recorded for an (id, gen) pair wins; later calls for the
  # same pair leave it untouched.
  generations = (@xref[id] ||= {})
  generations[gen] ||= offset
end
217
+ end
218
+ ################################################################################
219
+ end
220
+ ################################################################################
data/lib/pdf/reader.rb ADDED
@@ -0,0 +1,296 @@
1
+ ################################################################################
2
+ #
3
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
4
+ # Copyright (C) 2011 James Healy
5
+ #
6
+ # Permission is hereby granted, free of charge, to any person obtaining
7
+ # a copy of this software and associated documentation files (the
8
+ # "Software"), to deal in the Software without restriction, including
9
+ # without limitation the rights to use, copy, modify, merge, publish,
10
+ # distribute, sublicense, and/or sell copies of the Software, and to
11
+ # permit persons to whom the Software is furnished to do so, subject to
12
+ # the following conditions:
13
+ #
14
+ # The above copyright notice and this permission notice shall be
15
+ # included in all copies or substantial portions of the Software.
16
+ #
17
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
21
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
22
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
23
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24
+ #
25
+ ################################################################################
26
+
27
+ require 'stringio'
28
+ require 'zlib'
29
+
30
+ require 'ascii85'
31
+
32
+ module PDF
33
+ ################################################################################
34
+ # The Reader class serves as an entry point for parsing a PDF file.
35
+ #
36
+ # PDF is a page based file format. There is some data associated with the
37
+ # document (metadata, bookmarks, etc) but all visible content is stored
38
+ # under a Page object.
39
+ #
40
+ # In most use cases for extracting and examining the contents of a PDF it
41
+ # makes sense to traverse the information using page based iteration.
42
+ #
43
+ # In addition to the documentation here, check out the
44
+ # PDF::Reader::Page class.
45
+ #
46
+ # == File Metadata
47
+ #
48
+ # reader = PDF::Reader.new("somefile.pdf")
49
+ #
50
+ # puts reader.pdf_version
51
+ # puts reader.info
52
+ # puts reader.metadata
53
+ # puts reader.page_count
54
+ #
55
+ # == Iterating over page content
56
+ #
57
+ # reader = PDF::Reader.new("somefile.pdf")
58
+ #
59
+ # reader.pages.each do |page|
60
+ # puts page.fonts
61
+ # puts page.images
62
+ # puts page.text
63
+ # end
64
+ #
65
+ # == Extracting all text
66
+ #
67
+ # reader = PDF::Reader.new("somefile.pdf")
68
+ #
69
+ # reader.pages.map(&:text)
70
+ #
71
+ # == Extracting content from a single page
72
+ #
73
+ # reader = PDF::Reader.new("somefile.pdf")
74
+ #
75
+ # page = reader.page(1)
76
+ # puts page.fonts
77
+ # puts page.images
78
+ # puts page.text
79
+ #
80
+ # == Low level callbacks (ala current version of PDF::Reader)
81
+ #
82
+ # reader = PDF::Reader.new("somefile.pdf")
83
+ #
84
+ # page = reader.page(1)
85
+ # page.walk(receiver)
86
+ #
87
+ class Reader
88
+
89
+ # lowlevel hash-like access to all objects in the underlying PDF
90
+ attr_reader :objects
91
+
92
+ attr_reader :page_count, :pdf_version, :info, :metadata
93
+
94
+ # creates a new document reader for the provided PDF.
95
+ #
96
+ # input can be an IO-ish object (StringIO, File, etc) containing a PDF
97
+ # or a filename
98
+ #
99
+ # reader = PDF::Reader.new("somefile.pdf")
100
+ #
101
+ # File.open("somefile.pdf","rb") do |file|
102
+ # reader = PDF::Reader.new(file)
103
+ # end
104
+ #
105
def initialize(input = nil)
  # The deprecated class-level helpers call new without an input and pass
  # the IO to parse/object later, so there is nothing to load in that case.
  return unless input

  @objects = PDF::Reader::ObjectHash.new(input)
  @page_count = get_page_count
  @pdf_version = @objects.pdf_version
  @info = @objects.deref(@objects.trailer[:Info])
  @metadata = get_metadata
end
114
+
115
+ # syntactic sugar for opening a PDF file. Accepts the same arguments
116
+ # as new().
117
+ #
118
+ # PDF::Reader.open("somefile.pdf") do |reader|
119
+ # puts reader.pdf_version
120
+ # end
121
+ #
122
def self.open(input, &block)
  # Build a reader for input and hand it to the caller's block; the block's
  # value becomes the return value.
  reader = PDF::Reader.new(input)
  yield reader
end
125
+
126
+ # DEPRECATED: this method was deprecated in version 0.11.0 and will
127
+ # eventually be removed
128
+ #
129
+ #
130
+ # Parse the file with the given name, sending events to the given receiver.
131
+ #
132
def self.file(name, receivers, opts = {})
  # Open in binary mode; the block form of File.open closes the handle once
  # parsing has finished.
  File.open(name, "rb") { |handle| new.parse(handle, receivers, opts) }
end
137
+
138
+ # DEPRECATED: this method was deprecated in version 0.11.0 and will
139
+ # eventually be removed
140
+ #
141
+ # Parse the given string, sending events to the given receiver.
142
+ #
143
def self.string(str, receivers, opts = {})
  # Wrap the string in a StringIO so it can be parsed like any other IO.
  StringIO.open(str) { |io| new.parse(io, receivers, opts) }
end
148
+
149
+ # DEPRECATED: this method was deprecated in version 0.11.0 and will
150
+ # eventually be removed
151
+ #
152
+ # Parse the file with the given name, returning an unmarshalled ruby version of
153
+ # the requested pdf object
154
+ #
155
def self.object_file(name, id, gen = 0)
  # id and gen are coerced with to_i, so numeric strings are accepted.
  File.open(name, "rb") do |handle|
    new.object(handle, id.to_i, gen.to_i)
  end
end
160
+
161
+ # DEPRECATED: this method was deprecated in version 0.11.0 and will
162
+ # eventually be removed
163
+ #
164
+ # Parse the given string, returning an unmarshalled ruby version of
165
+ # the requested pdf object
166
+ #
167
def self.object_string(str, id, gen = 0)
  # id and gen are coerced with to_i, so numeric strings are accepted.
  StringIO.open(str) do |io|
    new.object(io, id.to_i, gen.to_i)
  end
end
172
+
173
+ # returns an array of PDF::Reader::Page objects, one for each
174
+ # page in the source PDF.
175
+ #
176
+ # reader = PDF::Reader.new("somefile.pdf")
177
+ #
178
+ # reader.pages.each do |page|
179
+ # puts page.fonts
180
+ # puts page.images
181
+ # puts page.text
182
+ # end
183
+ #
184
+ # See the docs for PDF::Reader::Page to read more about the
185
+ # methods available on each page
186
+ #
187
def pages
  # Page numbers are 1-based and run up to @page_count inclusive.
  page_numbers = 1..@page_count
  page_numbers.map do |number|
    PDF::Reader::Page.new(@objects, number)
  end
end
192
+
193
+ # returns a single PDF::Reader::Page for the specified page.
194
+ # Use this instead of pages method when you need to access just a single
195
+ # page
196
+ #
197
+ # reader = PDF::Reader.new("somefile.pdf")
198
+ # page = reader.page(10)
199
+ #
200
+ # puts page.text
201
+ #
202
+ # See the docs for PDF::Reader::Page to read more about the
203
+ # methods available on each page
204
+ #
205
def page(num)
  # num is coerced with to_i; anything outside 1..@page_count is rejected.
  num = num.to_i
  in_range = num >= 1 && num <= @page_count
  raise ArgumentError, "valid pages are 1 .. #{@page_count}" unless in_range

  PDF::Reader::Page.new(@objects, num)
end
210
+
211
+
212
+ # DEPRECATED: this method was deprecated in version 0.11.0 and will
213
+ # eventually be removed
214
+ #
215
+ # Given an IO object that contains PDF data, parse it.
216
+ #
217
def parse(io, receivers, opts = {})
  # Runs every strategy over the document in io and returns self.
  ohash = ObjectHash.new(io)

  # A trailer with an :Encrypt entry is rejected up front.
  raise ::PDF::Reader::UnsupportedFeatureError, 'PDF::Reader cannot read encrypted PDF files' if ohash.trailer[:Encrypt]

  # Caller-supplied opts override these defaults.
  options = {:pages => true, :raw_text => false, :metadata => true}.merge(opts)

  strategies.each { |strategy| strategy.new(ohash, receivers, options).process }

  self
end
233
+
234
+ # DEPRECATED: this method was deprecated in version 0.11.0 and will
235
+ # eventually be removed
236
+ #
237
+ # Given an IO object that contains PDF data, return the contents of a single object
238
+ #
239
def object(io, id, gen)
  # Replaces @objects with a hash built from io, then dereferences the
  # requested (id, gen) pair.
  @objects = ObjectHash.new(io)
  wanted = Reference.new(id, gen)
  @objects.deref(wanted)
end
244
+
245
+ private
246
+
247
def strategies
  # Built once and cached; the order here is the order parse runs them in.
  return @strategies if @strategies

  @strategies = [::PDF::Reader::MetadataStrategy, ::PDF::Reader::PagesStrategy]
end
253
+
254
def root
  # Returns the object the trailer's :Root entry points at, dereferenced once
  # and cached.
  #
  # The previous `root ||= ...` assigned a method-local variable that was
  # discarded on return, so nothing was memoised and each caller triggered
  # another deref. An instance variable makes the cache effective.
  @root ||= @objects.deref(@objects.trailer[:Root])
end
257
+
258
def get_metadata
  # nil when the root's :Metadata entry dereferences to nothing, otherwise
  # the stream's unfiltered data.
  stream = @objects.deref(root[:Metadata])
  return nil unless stream

  stream.unfiltered_data
end
262
+
263
def get_page_count
  # Reads :Count from whatever the root's :Pages entry dereferences to.
  @objects.deref(root[:Pages])[:Count]
end
267
+
268
+ end
269
+ end
270
+ ################################################################################
271
+
272
+ require 'pdf/reader/abstract_strategy'
273
+ require 'pdf/reader/buffer'
274
+ require 'pdf/reader/cmap'
275
+ require 'pdf/reader/encoding'
276
+ require 'pdf/reader/error'
277
+ require 'pdf/reader/filter'
278
+ require 'pdf/reader/font'
279
+ require 'pdf/reader/form_xobject'
280
+ require 'pdf/reader/lzw'
281
+ require 'pdf/reader/metadata_strategy'
282
+ require 'pdf/reader/object_cache'
283
+ require 'pdf/reader/object_hash'
284
+ require 'pdf/reader/object_stream'
285
+ require 'pdf/reader/pages_strategy'
286
+ require 'pdf/reader/parser'
287
+ require 'pdf/reader/print_receiver'
288
+ require 'pdf/reader/reference'
289
+ require 'pdf/reader/register_receiver'
290
+ require 'pdf/reader/stream'
291
+ require 'pdf/reader/text_receiver'
292
+ require 'pdf/reader/page_text_receiver'
293
+ require 'pdf/reader/token'
294
+ require 'pdf/reader/xref'
295
+ require 'pdf/reader/page'
296
+ require 'pdf/hash'
data/lib/pdf-reader.rb ADDED
@@ -0,0 +1 @@
1
+ require "pdf/reader"