fireinc-pdf-reader 0.11.0.alpha

Sign up to get free protection for your applications and to get access to all the features.
Files changed (54) hide show
  1. data/CHANGELOG +168 -0
  2. data/MIT-LICENSE +21 -0
  3. data/README.rdoc +137 -0
  4. data/Rakefile +34 -0
  5. data/TODO +45 -0
  6. data/bin/pdf_list_callbacks +15 -0
  7. data/bin/pdf_object +48 -0
  8. data/bin/pdf_text +15 -0
  9. data/examples/callbacks.rb +21 -0
  10. data/examples/extract_bates.rb +49 -0
  11. data/examples/extract_images.rb +108 -0
  12. data/examples/hash.rb +12 -0
  13. data/examples/metadata.rb +25 -0
  14. data/examples/page_counter_improved.rb +23 -0
  15. data/examples/page_counter_naive.rb +24 -0
  16. data/examples/rspec.rb +57 -0
  17. data/examples/text.rb +40 -0
  18. data/examples/version.rb +25 -0
  19. data/lib/pdf/hash.rb +15 -0
  20. data/lib/pdf/reader/abstract_strategy.rb +81 -0
  21. data/lib/pdf/reader/buffer.rb +346 -0
  22. data/lib/pdf/reader/cmap.rb +138 -0
  23. data/lib/pdf/reader/encoding.rb +190 -0
  24. data/lib/pdf/reader/encodings/mac_expert.txt +159 -0
  25. data/lib/pdf/reader/encodings/mac_roman.txt +128 -0
  26. data/lib/pdf/reader/encodings/pdf_doc.txt +40 -0
  27. data/lib/pdf/reader/encodings/standard.txt +47 -0
  28. data/lib/pdf/reader/encodings/symbol.txt +154 -0
  29. data/lib/pdf/reader/encodings/win_ansi.txt +29 -0
  30. data/lib/pdf/reader/encodings/zapf_dingbats.txt +201 -0
  31. data/lib/pdf/reader/error.rb +53 -0
  32. data/lib/pdf/reader/filter.rb +219 -0
  33. data/lib/pdf/reader/font.rb +133 -0
  34. data/lib/pdf/reader/form_xobject.rb +83 -0
  35. data/lib/pdf/reader/glyphlist.txt +4322 -0
  36. data/lib/pdf/reader/lzw.rb +123 -0
  37. data/lib/pdf/reader/metadata_strategy.rb +56 -0
  38. data/lib/pdf/reader/object_cache.rb +85 -0
  39. data/lib/pdf/reader/object_hash.rb +289 -0
  40. data/lib/pdf/reader/object_stream.rb +51 -0
  41. data/lib/pdf/reader/page.rb +185 -0
  42. data/lib/pdf/reader/page_text_receiver.rb +278 -0
  43. data/lib/pdf/reader/pages_strategy.rb +475 -0
  44. data/lib/pdf/reader/parser.rb +225 -0
  45. data/lib/pdf/reader/print_receiver.rb +18 -0
  46. data/lib/pdf/reader/reference.rb +66 -0
  47. data/lib/pdf/reader/register_receiver.rb +95 -0
  48. data/lib/pdf/reader/stream.rb +69 -0
  49. data/lib/pdf/reader/text_receiver.rb +264 -0
  50. data/lib/pdf/reader/token.rb +41 -0
  51. data/lib/pdf/reader/xref.rb +220 -0
  52. data/lib/pdf/reader.rb +296 -0
  53. data/lib/pdf-reader.rb +1 -0
  54. metadata +211 -0
@@ -0,0 +1,25 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # coding: utf-8
4
+ # Extract metadata only
5
+
6
+ require 'rubygems'
7
+ require 'pdf/reader'
8
+
9
# Receiver that simply remembers whatever the metadata callbacks hand to it.
class MetaDataReceiver
  # regular - last value passed to #metadata
  # xml     - last value passed to #metadata_xml
  attr_accessor :regular, :xml

  # Callback: store the supplied metadata value.
  def metadata(data)
    self.regular = data
  end

  # Callback: store the supplied XML metadata value.
  def metadata_xml(data)
    self.xml = data
  end
end
21
+
22
# Read only the metadata of the PDF named on the command line and print
# both captured values.
filename = ARGV.shift
# without a filename there is nothing to read, so say how to call the script
abort "usage: #{$0} <file.pdf>" if filename.nil?

receiver = MetaDataReceiver.new
PDF::Reader.file(filename, receiver, :pages => false, :metadata => true)
puts receiver.regular.inspect
puts receiver.xml.inspect
@@ -0,0 +1,23 @@
1
+ #!/usr/bin/env ruby
2
+ # coding: utf-8
3
+
4
+ # Improved Page Counter
5
+ #
6
+ # A simple app to display the number of pages in a PDF File.
7
+ #
8
+
9
+ require 'rubygems'
10
+ require 'pdf/reader'
11
+
12
# Receiver that records the value delivered to the page_count callback.
class PageReceiver
  attr_accessor :pages

  # Callback: remember the page count we were given.
  def page_count(arg)
    self.pages = arg
  end
end
20
+
21
# Count the pages of the PDF named on the command line, falling back to
# "somefile.pdf" when no argument is given.
receiver = PageReceiver.new
PDF::Reader.file(ARGV.shift || "somefile.pdf", receiver, :pages => false)
puts "#{receiver.pages} pages"
@@ -0,0 +1,24 @@
1
+ #!/usr/bin/env ruby
2
+ # coding: utf-8
3
+
4
+ # A simple app to count the number of pages in a PDF File.
5
+
6
+ require 'rubygems'
7
+ require 'pdf/reader'
8
+
9
# Receiver that counts how many times the end_page callback fires.
class PageReceiver
  attr_accessor :counter

  def initialize
    self.counter = 0
  end

  # Callback: one more page has finished.
  def end_page
    self.counter = counter + 1
  end
end
21
+
22
# Walk every page of the PDF named on the command line (default:
# "somefile.pdf") and print how many end_page callbacks were seen.
receiver = PageReceiver.new
PDF::Reader.file(ARGV.shift || "somefile.pdf", receiver)
puts "#{receiver.counter} pages"
data/examples/rspec.rb ADDED
@@ -0,0 +1,57 @@
1
+ #!/usr/bin/env ruby
2
+ # coding: utf-8
3
+
4
+ # Basic RSpec of a generated PDF
5
+
6
+ require 'rubygems'
7
+ require 'pdf/reader'
8
+ require 'pdf/writer'
9
+ require 'spec'
10
+
11
# Receiver that collects the text drawn on each page. #content holds one
# string per page, in the order the pages were started.
class PageTextReceiver
  attr_accessor :content

  def initialize
    @content = []
  end

  # Called when page parsing starts: open a fresh, empty entry for the page.
  def begin_page(arg = nil)
    @content << ""
  end

  # Append the stripped text to the current page's entry.
  def show_text(string, *params)
    # text may arrive before any begin_page callback; open an entry so the
    # append below has something to write to instead of failing on nil
    begin_page if @content.empty?
    @content.last << string.strip
  end

  # there's a few text callbacks, so make sure we process them all
  alias :super_show_text :show_text
  alias :move_to_next_line_and_show_text :show_text
  alias :set_spacing_next_line_show_text :show_text

  # The first argument is a list mixing strings with other values; only the
  # strings are recorded.
  def show_text_with_positioning(*params)
    params = params.first
    params.each { |str| show_text(str) if str.kind_of?(String) }
  end
end
37
+
38
# Round trip: write a two page PDF, read it back and check the text that
# was collected for each page.
context "My generated PDF" do
  specify "should have the correct text on 2 pages" do
    pdf_path = "chunkybacon.pdf"

    # build the document, one word per page
    writer = PDF::Writer.new
    writer.text "Chunky", :font_size => 32, :justification => :center
    writer.start_new_page
    writer.text "Bacon", :font_size => 32, :justification => :center
    writer.save_as(pdf_path)

    # feed the generated file back through the reader
    receiver = PageTextReceiver.new
    PDF::Reader.file(pdf_path, receiver)

    # one entry per page, in page order
    pages = receiver.content
    pages.size.should eql(2)
    pages[0].should eql("Chunky")
    pages[1].should eql("Bacon")
  end
end
data/examples/text.rb ADDED
@@ -0,0 +1,40 @@
1
+ #!/usr/bin/env ruby
2
+ # coding: utf-8
3
+
4
+ # Extract all text from a single PDF
5
+
6
+ require 'rubygems'
7
+ require 'pdf/reader'
8
+
9
# Receiver that collects the text drawn on each page. #content holds one
# string per page, in the order the pages were started.
class PageTextReceiver
  attr_accessor :content

  def initialize
    @content = []
  end

  # Called when page parsing starts: open a fresh, empty entry for the page.
  def begin_page(arg = nil)
    @content << ""
  end

  # record text that is drawn on the page
  def show_text(string, *params)
    # text may arrive before any begin_page callback; open an entry so the
    # append below has something to write to instead of failing on nil
    begin_page if @content.empty?
    @content.last << string.strip
  end

  # there's a few text callbacks, so make sure we process them all
  alias :super_show_text :show_text
  alias :move_to_next_line_and_show_text :show_text
  alias :set_spacing_next_line_show_text :show_text

  # this final text callback takes slightly different arguments: its first
  # argument is a list mixing strings with other values, and only the
  # strings are recorded
  def show_text_with_positioning(*params)
    params = params.first
    params.each { |str| show_text(str) if str.kind_of?(String) }
  end
end
37
+
38
# Extract the text of the PDF named on the command line (default:
# "somefile.pdf") and print the per-page strings.
receiver = PageTextReceiver.new
PDF::Reader.file(ARGV.shift || "somefile.pdf", receiver)
puts receiver.content.inspect
@@ -0,0 +1,25 @@
1
+ #!/usr/bin/env ruby
2
+ # coding: utf-8
3
+
4
+ # Determine the PDF version of a file
5
+
6
+ require 'rubygems'
7
+ require 'pdf/reader'
8
+
9
# Receiver that remembers the value delivered to the pdf_version callback.
class VersionReceiver
  attr_accessor :version

  def initialize
    self.version = nil
  end

  # Callback: store the version we were handed (nil when called bare).
  def pdf_version(arg = nil)
    self.version = arg
  end
end
22
+
23
# Print the PDF version of the file named on the command line.
filename = ARGV.shift
# without a filename there is nothing to read, so say how to call the script
abort "usage: #{$0} <file.pdf>" if filename.nil?

receiver = VersionReceiver.new
PDF::Reader.file(filename, receiver)
puts receiver.version
data/lib/pdf/hash.rb ADDED
@@ -0,0 +1,15 @@
1
+ # coding: utf-8
2
+
3
module PDF
  # Deprecated name for PDF::Reader::ObjectHash. It behaves like its parent
  # class but prints a deprecation notice when built and when #version is used.
  class Hash < ::PDF::Reader::ObjectHash # :nodoc:
    # Warn about the deprecation, then hand the input to the parent class.
    def initialize(input)
      warn "DEPRECATION NOTICE: PDF::Hash has been deprecated, use PDF::Reader::ObjectHash instead"
      super(input)
    end

    # Deprecated spelling of #pdf_version.
    def version
      warn "DEPRECATION NOTICE: PDF::Hash#version has been deprecated, use PDF::Reader::ObjectHash#pdf_version instead"
      pdf_version
    end
  end
end
@@ -0,0 +1,81 @@
1
+ # coding: utf-8
2
+
3
+ class PDF::Reader
4
+
5
# DEPRECATED: this class was deprecated in version 0.11.0 and will
# eventually be removed
#
# Shared plumbing for strategies: it holds the object hash, the receivers
# and the options, and offers helpers to fire callbacks and to look up the
# trailer, root, info and pages objects.
class AbstractStrategy # :nodoc:

  # ohash     - the object hash to read objects and the trailer from
  # receivers - a single receiver or an Array of receivers
  # options   - optional settings hash
  def initialize(ohash, receivers, options = {})
    @ohash, @options = ohash, options
    @receivers = receivers.is_a?(Array) ? receivers : [receivers]
  end

  private

  # the options given at construction, or an empty hash if nil was passed
  def options
    @options || {}
  end

  # calls the name callback method on each receiver that responds to it,
  # with params as the arguments
  #
  def callback(name, params = [])
    @receivers.each do |receiver|
      receiver.send(name, *params) if receiver.respond_to?(name)
    end
  end

  # strings outside of page content should be in either PDFDocEncoding or UTF-16.
  #
  # Returns a decoded copy of obj: strings are converted to UTF-8, hashes and
  # arrays are rebuilt with their values decoded, anything else is returned
  # unchanged. The argument itself is never modified, so decoding the same
  # hash twice cannot convert its strings twice.
  def decode_strings(obj)
    case obj
    when String
      # a leading 0xFE 0xFF byte order mark flags UTF-16 text
      if obj[0, 2].unpack("C*").slice(0, 2) == [254, 255]
        PDF::Reader::Encoding.new(:UTF16Encoding).to_utf8(obj[2, obj.size])
      else
        PDF::Reader::Encoding.new(:PDFDocEncoding).to_utf8(obj)
      end
    when Hash
      decoded = {}
      obj.each { |key, val| decoded[key] = decode_strings(val) }
      decoded
    when Array
      obj.collect { |item| decode_strings(item) }
    else
      obj
    end
  end

  # the object the trailer's :Info entry points at
  def info
    ohash.object(trailer[:Info])
  end

  def info?
    info ? true : false
  end

  def ohash
    @ohash
  end

  # the object the root's :Pages entry points at
  def pages
    ohash.object(root[:Pages])
  end

  def pages?
    pages ? true : false
  end

  # the object the trailer's :Root entry points at
  def root
    ohash.object(trailer[:Root])
  end

  def root?
    root ? true : false
  end

  def trailer
    ohash.trailer
  end

end
81
+ end
@@ -0,0 +1,346 @@
1
+ # coding: utf-8
2
+
3
+ ################################################################################
4
+ #
5
+ # Copyright (C) 2010 James Healy (jimmy@deefa.com)
6
+ #
7
+ # Permission is hereby granted, free of charge, to any person obtaining
8
+ # a copy of this software and associated documentation files (the
9
+ # "Software"), to deal in the Software without restriction, including
10
+ # without limitation the rights to use, copy, modify, merge, publish,
11
+ # distribute, sublicense, and/or sell copies of the Software, and to
12
+ # permit persons to whom the Software is furnished to do so, subject to
13
+ # the following conditions:
14
+ #
15
+ # The above copyright notice and this permission notice shall be
16
+ # included in all copies or substantial portions of the Software.
17
+ #
18
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
22
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
+ #
26
+ ################################################################################
27
+
28
+ class PDF::Reader
29
+
30
# A string tokeniser that recognises PDF grammar. When passed an IO stream,
# repeated calls to token() will return the next token from the source.
#
# This is very low level, and getting the raw tokens is not very useful in itself.
#
# This will usually be used in conjunction with PDF::Reader::Parser, which converts
# the raw tokens into objects we can work with (strings, ints, arrays, etc)
#
class Buffer

  attr_reader :pos

  # Creates a new buffer.
  #
  # Params:
  #
  #   io - an IO-like object holding the raw data to tokenise. It must
  #        respond to seek, pos and read (a File or StringIO, for example)
  #
  # options:
  #
  #   :seek - a byte offset to seek to before starting to tokenise
  #   :content_stream - set to true if buffer will be tokenising a
  #                     content stream. Defaults to false
  #
  def initialize(io, opts = {})
    @io = io
    @tokens = []
    @in_content_stream = opts[:content_stream]

    @io.seek(opts[:seek]) if opts[:seek]
    @pos = @io.pos
  end

  # return true if there are no more tokens left
  #
  def empty?
    prepare_tokens if @tokens.size < 3

    @tokens.empty?
  end

  # return raw bytes from the underlying IO stream.
  #
  #   bytes - the number of bytes to read
  #
  # options:
  #
  #   :skip_eol - if true, the IO stream is advanced past a CRLF or LF that
  #               is sitting under the io cursor.
  #
  # Returns nil when :skip_eol is set and nothing is left to read.
  #
  def read(bytes, opts = {})
    reset_pos

    if opts[:skip_eol]
      # look at the byte before the cursor together with the one under it
      @io.seek(-1, IO::SEEK_CUR)
      str = @io.read(2)
      if str.nil?
        return nil
      elsif str == "\r\n"
        # the cursor already sits just past the CRLF
      elsif str[0, 1] == "\n"
        @io.seek(-1, IO::SEEK_CUR)
      else
        # no EOL here: undo the peek. Rewind by what was actually read so a
        # short read at the end of the source does not rewind too far.
        @io.seek(-str.size, IO::SEEK_CUR)
      end
    end

    data = @io.read(bytes)
    save_pos
    data
  end

  # return the next token from the source. Returns a string if a token
  # is found, nil if there are no tokens left.
  #
  def token
    reset_pos
    prepare_tokens if @tokens.size < 3
    merge_indirect_reference
    prepare_tokens if @tokens.size < 3

    @tokens.shift
  end

  # return the byte offset where the first XRef table in the source can be found.
  #
  # Only the last 1024 bytes of the source are examined. The offset is the
  # integer value of the line that precedes the final %%EOF marker.
  #
  # Raises MalformedPDFError if the source is empty, has no %%EOF marker in
  # that region, or has nothing before the marker.
  #
  def find_first_xref_offset
    begin
      @io.seek(-1024, IO::SEEK_END)
    rescue SystemCallError
      # the source is shorter than 1024 bytes, so scan all of it
      @io.seek(0)
    end
    data = @io.read(1024)

    # read returns nil when the source is empty
    raise MalformedPDFError, "PDF does not contain EOF marker" if data.nil?

    # the PDF 1.7 spec (section #3.4) says that EOL markers can be either \r, \n, or both.
    lines = data.split(/[\n\r]+/).reverse
    eof_index = lines.index { |l| l.strip == "%%EOF" }

    raise MalformedPDFError, "PDF does not contain EOF marker" if eof_index.nil?
    raise MalformedPDFError, "PDF EOF marker does not follow offset" if eof_index >= lines.size-1
    lines[eof_index+1].to_i
  end

  private

  # Returns true if this buffer is parsing a content stream
  #
  def in_content_stream?
    @in_content_stream ? true : false
  end

  # Some bastard moved our IO stream cursor. Restore it.
  #
  def reset_pos
    @io.seek(@pos) if @io.pos != @pos
  end

  # save the current position of the source IO stream. If someone else (like another buffer)
  # moves the cursor, we can then restore it.
  #
  def save_pos
    @pos = @io.pos
  end

  # attempt to prime the buffer with the next few tokens.
  #
  # Nothing is read while the state is :stream; the raw stream bytes are
  # fetched with read() instead.
  #
  def prepare_tokens
    10.times do
      case state
      when :literal_string then prepare_literal_token
      when :hex_string     then prepare_hex_token
      when :regular        then prepare_regular_token
      when :inline         then prepare_inline_token
      end
    end

    save_pos
  end

  # tokenising behaves slightly differently based on the current context.
  # Determine the current context/state by examining the last token we found
  #
  def state
    if @tokens[-1] == "("
      :literal_string
    elsif @tokens[-1] == "<"
      :hex_string
    elsif @tokens[-1] == "stream"
      :stream
    elsif in_content_stream? && @tokens[-1] == "ID"
      :inline
    else
      :regular
    end
  end

  # detect a series of 3 tokens that make up an indirect object. If we find
  # them, replace the tokens with a PDF::Reader::Reference instance.
  #
  # Merging them into a single string was another option, but that would mean
  # code further up the stack would need to check every token to see if it looks
  # like an indirect object. For optimisation reasons, I'd rather avoid
  # that extra check.
  #
  # It's incredibly likely that the next 3 tokens in the buffer are NOT an
  # indirect reference, so test for that case first and avoid the relatively
  # expensive regexp checks if possible.
  #
  def merge_indirect_reference
    return if @tokens.size < 3
    return if @tokens[2] != "R"

    if integer_token?(@tokens[0]) && integer_token?(@tokens[1])
      ref = PDF::Reader::Reference.new(@tokens[0].to_i, @tokens[1].to_i)
      @tokens[0, 3] = [ref]
    end
  end

  # true if tok is a string made up of nothing but decimal digits. The
  # pattern is anchored so tokens that merely contain a digit ("F1", "1.5")
  # are not mistaken for object or generation numbers.
  #
  def integer_token?(tok)
    tok.is_a?(String) && !(tok =~ /\A\d+\z/).nil?
  end

  # read the raw data that follows an ID token, up to the EI marker, and
  # store it (stripped) as a single token. The cursor is left on the EI so
  # it is tokenised normally afterwards.
  #
  # If the source ends before an EI marker is seen, everything that was read
  # is kept as the data token.
  #
  def prepare_inline_token
    str = ""
    chr = nil

    while str[-2,2] != "EI"
      chr = @io.read(1)
      break if chr.nil?
      str << chr
    end

    if chr.nil?
      # ran out of input: there is no EI suffix to trim or to rewind over
      @tokens << str.strip
    else
      @tokens << str[0, str.size-2].strip
      @io.seek(-2, IO::SEEK_CUR)
    end
  end

  # if we're currently inside a hex string, read hex nibbles until
  # we find a closing >
  #
  # Digits and ASCII letters are collected, bytes with a value of 32 or less
  # are skipped, and any other byte ends the string. If that byte is not
  # ">" a closing ">" token is added before it.
  #
  def prepare_hex_token
    str = ""
    finished = false

    until finished
      chr = @io.read(1)
      codepoint = chr.to_s.unpack("C*").first
      if chr.nil?
        finished = true # unbalanced params
      elsif (48..57).include?(codepoint) || (65..90).include?(codepoint) || (97..122).include?(codepoint)
        str << chr
      elsif codepoint <= 32
        # ignore it
      else
        @tokens << str if str.size > 0
        @tokens << ">" if chr != ">"
        @tokens << chr
        finished = true
      end
    end
  end

  # if we're currently inside a literal string we more or less just read bytes until
  # we find the closing ) delimiter. Lots of bytes that would otherwise indicate the
  # start of a new token in regular mode are left untouched when inside a literal
  # string.
  #
  # The entire literal string will be returned as a single token. It will need further
  # processing to fix things like escaped new lines, but that's someone else's
  # problem.
  #
  def prepare_literal_token
    str = ""
    count = 1 # depth of unclosed parentheses, including the opening one

    while count > 0
      chr = @io.read(1)
      if chr.nil?
        count = 0 # unbalanced params
      elsif chr == "\x5c"
        # backslash: keep it together with the byte it escapes
        str << chr << @io.read(1).to_s
      elsif chr == "("
        str << "("
        count += 1
      elsif chr == ")"
        count -= 1
        str << ")" unless count == 0
      else
        str << chr
      end
    end

    @tokens << str if str.size > 0
    @tokens << ")"
  end

  # Extract the next regular token and stock it in our buffer, ready to be returned.
  #
  # What each byte means is complex, check out section "3.1.1 Character Set" of the 1.7 spec
  # to read up on it.
  #
  def prepare_regular_token
    tok = ""

    while chr = @io.read(1)
      case chr
      when "\x25"
        # comment, ignore everything until the next EOL char
        skip_comment
        # a comment separates tokens, so one that was in progress ends here
        break if tok.size > 0
      when "\x00", "\x09", "\x0A", "\x0C", "\x0D", "\x20"
        # white space, token finished
        break
      when "\x3C", "\x3E"
        # '<' or '>' delimiter, start of new token
        @tokens << tok if tok.size > 0
        chr << @io.read(1) if peek_char == chr # check if token is actually '<<' or '>>'
        @tokens << chr
        tok = ""
        break
      when "\x28", "\x5B", "\x7B", "\x2F", "\x29", "\x5D", "\x7D"
        # single byte opening or closing delimiter
        @tokens << tok if tok.size > 0
        @tokens << chr
        tok = ""
        break
      else
        tok << chr
      end
    end

    @tokens << tok if tok.size > 0
  end

  # advance the io stream past the rest of a comment, stopping after the
  # first EOL byte or at the end of the source
  #
  def skip_comment
    loop do
      chr = @io.read(1)
      break if chr.nil? || chr == "\x0A" || chr == "\x0D"
    end
  end

  # peek at the next character in the io stream, leaving the stream position
  # untouched
  #
  def peek_char
    chr = @io.read(1)
    @io.seek(-1, IO::SEEK_CUR) unless chr.nil?
    chr
  end
end
346
+ end