pdf-reader 0.11.0.alpha → 0.12.0.alpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/examples/text.rb CHANGED
@@ -6,35 +6,10 @@
6
6
  require 'rubygems'
7
7
  require 'pdf/reader'
8
8
 
9
- class PageTextReceiver
10
- attr_accessor :content
9
+ filename = File.expand_path(File.dirname(__FILE__)) + "/../spec/data/cairo-unicode.pdf"
11
10
 
12
- def initialize
13
- @content = []
14
- end
15
-
16
- # Called when page parsing starts
17
- def begin_page(arg = nil)
18
- @content << ""
19
- end
20
-
21
- # record text that is drawn on the page
22
- def show_text(string, *params)
23
- @content.last << string.strip
24
- end
25
-
26
- # there's a few text callbacks, so make sure we process them all
27
- alias :super_show_text :show_text
28
- alias :move_to_next_line_and_show_text :show_text
29
- alias :set_spacing_next_line_show_text :show_text
30
-
31
- # this final text callback takes slightly different arguments
32
- def show_text_with_positioning(*params)
33
- params = params.first
34
- params.each { |str| show_text(str) if str.kind_of?(String)}
11
+ PDF::Reader.open(filename) do |reader|
12
+ reader.pages.each do |page|
13
+ puts page.text
35
14
  end
36
15
  end
37
-
38
- receiver = PageTextReceiver.new
39
- pdf = PDF::Reader.file("somefile.pdf", receiver)
40
- puts receiver.content.inspect
data/examples/version.rb CHANGED
@@ -6,20 +6,8 @@
6
6
  require 'rubygems'
7
7
  require 'pdf/reader'
8
8
 
9
- class VersionReceiver
10
- attr_accessor :version
11
-
12
- def initialize
13
- @version = nil
14
- end
15
-
16
- # Called when document parsing starts
17
- def pdf_version(arg = nil)
18
- @version = arg
19
- end
9
+ filename = File.expand_path(File.dirname(__FILE__)) + "/../spec/data/cairo-basic.pdf"
20
10
 
11
+ PDF::Reader.open(filename) do |reader|
12
+ puts reader.pdf_version
21
13
  end
22
-
23
- receiver = VersionReceiver.new
24
- pdf = PDF::Reader.file(ARGV.shift, receiver)
25
- puts receiver.version
data/lib/pdf/reader.rb CHANGED
@@ -84,13 +84,18 @@ module PDF
84
84
  # page = reader.page(1)
85
85
  # page.walk(receiver)
86
86
  #
87
+ # == Encrypted Files
88
+ #
89
+ # Depending on the algorithm it may be possible to parse an encrypted file.
90
+ # For standard PDF encryption you'll need the :password option
91
+ #
92
+ # reader = PDF::Reader.new("somefile.pdf", :password => "apples")
93
+ #
87
94
  class Reader
88
95
 
89
96
  # lowlevel hash-like access to all objects in the underlying PDF
90
97
  attr_reader :objects
91
98
 
92
- attr_reader :page_count, :pdf_version, :info, :metadata
93
-
94
99
  # creates a new document reader for the provided PDF.
95
100
  #
96
101
  # input can be an IO-ish object (StringIO, File, etc) containing a PDF
@@ -102,16 +107,34 @@ module PDF
102
107
  # reader = PDF::Reader.new(file)
103
108
  # end
104
109
  #
105
- def initialize(input = nil)
110
+ # If the source file is encrypted you can provide a password for decrypting
111
+ #
112
+ # reader = PDF::Reader.new("somefile.pdf", :password => "apples")
113
+ #
114
+ def initialize(input = nil, opts = {})
106
115
  if input # support the deprecated Reader API
107
- @objects = PDF::Reader::ObjectHash.new(input)
108
- @page_count = get_page_count
109
- @pdf_version = @objects.pdf_version
110
- @info = @objects.deref(@objects.trailer[:Info])
111
- @metadata = get_metadata
116
+ @objects = PDF::Reader::ObjectHash.new(input, opts)
112
117
  end
113
118
  end
114
119
 
120
+ def info
121
+ @objects.deref(@objects.trailer[:Info])
122
+ end
123
+
124
+ def metadata
125
+ stream = @objects.deref(root[:Metadata])
126
+ stream ? stream.unfiltered_data : nil
127
+ end
128
+
129
+ def page_count
130
+ pages = @objects.deref(root[:Pages])
131
+ @page_count ||= pages[:Count]
132
+ end
133
+
134
+ def pdf_version
135
+ @objects.pdf_version
136
+ end
137
+
115
138
  # syntactic sugar for opening a PDF file. Accepts the same arguments
116
139
  # as new().
117
140
  #
@@ -119,8 +142,14 @@ module PDF
119
142
  # puts reader.pdf_version
120
143
  # end
121
144
  #
122
- def self.open(input, &block)
123
- yield PDF::Reader.new(input)
145
+ # or
146
+ #
147
+ # PDF::Reader.open("somefile.pdf", :password => "apples") do |reader|
148
+ # puts reader.pdf_version
149
+ # end
150
+ #
151
+ def self.open(input, opts = {}, &block)
152
+ yield PDF::Reader.new(input, opts)
124
153
  end
125
154
 
126
155
  # DEPRECATED: this method was deprecated in version 0.11.0 and will
@@ -185,7 +214,7 @@ module PDF
185
214
  # methods available on each page
186
215
  #
187
216
  def pages
188
- (1..@page_count).map { |num|
217
+ (1..self.page_count).map { |num|
189
218
  PDF::Reader::Page.new(@objects, num)
190
219
  }
191
220
  end
@@ -204,7 +233,7 @@ module PDF
204
233
  #
205
234
  def page(num)
206
235
  num = num.to_i
207
- raise ArgumentError, "valid pages are 1 .. #{@page_count}" if num < 1 || num > @page_count
236
+ raise ArgumentError, "valid pages are 1 .. #{self.page_count}" if num < 1 || num > self.page_count
208
237
  PDF::Reader::Page.new(@objects, num)
209
238
  end
210
239
 
@@ -217,10 +246,6 @@ module PDF
217
246
  def parse(io, receivers, opts = {})
218
247
  ohash = ObjectHash.new(io)
219
248
 
220
- if ohash.trailer[:Encrypt]
221
- raise ::PDF::Reader::UnsupportedFeatureError, 'PDF::Reader cannot read encrypted PDF files'
222
- end
223
-
224
249
  options = {:pages => true, :raw_text => false, :metadata => true}
225
250
  options.merge!(opts)
226
251
 
@@ -252,17 +277,7 @@ module PDF
252
277
  end
253
278
 
254
279
  def root
255
- root ||= @objects.deref(@objects.trailer[:Root])
256
- end
257
-
258
- def get_metadata
259
- stream = @objects.deref(root[:Metadata])
260
- stream ? stream.unfiltered_data : nil
261
- end
262
-
263
- def get_page_count
264
- pages = @objects.deref(root[:Pages])
265
- pages[:Count]
280
+ @root ||= @objects.deref(@objects.trailer[:Root])
266
281
  end
267
282
 
268
283
  end
@@ -276,6 +291,8 @@ require 'pdf/reader/encoding'
276
291
  require 'pdf/reader/error'
277
292
  require 'pdf/reader/filter'
278
293
  require 'pdf/reader/font'
294
+ require 'pdf/reader/form_xobject'
295
+ require 'pdf/reader/glyph_hash'
279
296
  require 'pdf/reader/lzw'
280
297
  require 'pdf/reader/metadata_strategy'
281
298
  require 'pdf/reader/object_cache'
@@ -286,6 +303,7 @@ require 'pdf/reader/parser'
286
303
  require 'pdf/reader/print_receiver'
287
304
  require 'pdf/reader/reference'
288
305
  require 'pdf/reader/register_receiver'
306
+ require 'pdf/reader/standard_security_handler'
289
307
  require 'pdf/reader/stream'
290
308
  require 'pdf/reader/text_receiver'
291
309
  require 'pdf/reader/page_text_receiver'
@@ -97,7 +97,7 @@ class PDF::Reader
97
97
  }.map { |num|
98
98
  original_codepoint_to_unicode(num, tounicode)
99
99
  }.map { |c|
100
- glyphnames[c] || c
100
+ names_to_unicode[c] || c
101
101
  }.map { |c|
102
102
  if c.nil? || !c.is_a?(Fixnum)
103
103
  PDF::Reader::Encoding::UNKNOWN_CHAR
@@ -170,8 +170,8 @@ class PDF::Reader
170
170
  mapping.size > 0
171
171
  end
172
172
 
173
- def glyphnames
174
- @glyphnames ||= PDF::Reader::Font.glyphnames
173
+ def names_to_unicode
174
+ @names_to_unicode ||= PDF::Reader::GlyphHash.new
175
175
  end
176
176
 
177
177
  def load_mapping(file)
@@ -49,5 +49,6 @@ class PDF::Reader
49
49
  class MalformedPDFError < RuntimeError; end
50
50
  class InvalidObjectError < MalformedPDFError; end
51
51
  class UnsupportedFeatureError < RuntimeError; end
52
+ class EncryptedPDFError < UnsupportedFeatureError; end
52
53
  end
53
54
  ################################################################################
@@ -31,6 +31,7 @@ class PDF::Reader
31
31
  # content.
32
32
  #
33
33
  class Filter # :nodoc:
34
+
34
35
  ################################################################################
35
36
  # creates a new filter for decoding content.
36
37
  #
@@ -41,14 +42,16 @@ class PDF::Reader
41
42
  @options = options
42
43
 
43
44
  case name.to_sym
44
- when :ASCII85Decode then @filter = :ascii85
45
- when :ASCIIHexDecode then @filter = :asciihex
46
- when :CCITTFaxDecode then @filter = nil
47
- when :DCTDecode then @filter = nil
48
- when :FlateDecode then @filter = :flate
49
- when :JBIG2Decode then @filter = nil
50
- when :LZWDecode then @filter = :lzw
51
- else raise UnsupportedFeatureError, "Unknown filter: #{name}"
45
+ when :ASCII85Decode then @filter = :ascii85
46
+ when :ASCIIHexDecode then @filter = :asciihex
47
+ when :CCITTFaxDecode then @filter = nil
48
+ when :DCTDecode then @filter = nil
49
+ when :FlateDecode then @filter = :flate
50
+ when :JBIG2Decode then @filter = nil
51
+ when :LZWDecode then @filter = :lzw
52
+ when :RunLengthDecode then @filter = :runlength
53
+ else
54
+ raise UnsupportedFeatureError, "Unknown filter: #{name}"
52
55
  end
53
56
  end
54
57
  ################################################################################
@@ -117,6 +120,36 @@ class PDF::Reader
117
120
  depredict(data, @options)
118
121
  end
119
122
  ################################################################################
123
+ # Decode the specified data with the RunLengthDecode compression algorithm
124
+ def runlength(data)
125
+ pos = 0
126
+ out = ""
127
+
128
+ while pos < data.length
129
+ length = data.getbyte(pos)
130
+ pos += 1
131
+
132
+ case
133
+ when length == 128
134
+ break
135
+ when length < 128
136
+ # When the length is < 128, we copy the following length+1 bytes
137
+ # literally.
138
+ out << data[pos, length + 1]
139
+ pos += length
140
+ else
141
+ # When the length is > 128, we copy the next byte (257 - length)
142
+ # times; i.e., "\xFA\x00" ([250, 0]) will expand to
143
+ # "\x00\x00\x00\x00\x00\x00\x00".
144
+ out << data[pos, 1] * (257 - length)
145
+ end
146
+
147
+ pos += 1
148
+ end
149
+
150
+ out
151
+ end
152
+ ################################################################################
120
153
  def depredict(data, opts = {})
121
154
  predictor = (opts || {})[:Predictor].to_i
122
155
 
@@ -133,7 +166,29 @@ class PDF::Reader
133
166
  end
134
167
  ################################################################################
135
168
  def tiff_depredict(data, opts = {})
136
- raise UnsupportedFeatureError, "TIFF predictor not supported"
169
+ data = data.unpack("C*")
170
+ unfiltered = []
171
+ bpc = opts[:BitsPerComponent] || 8
172
+ pixel_bits = bpc * opts[:Colors]
173
+ pixel_bytes = pixel_bits / 8
174
+ line_len = (pixel_bytes * opts[:Columns])
175
+ pos = 0
176
+
177
+ if bpc != 8
178
+ raise UnsupportedFeatureError, "TIFF predictor onlys supports 8 Bits Per Component"
179
+ end
180
+
181
+ until pos > data.size
182
+ row_data = data[pos, line_len]
183
+ row_data.each_with_index do |byte, index|
184
+ left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
185
+ row_data[index] = (byte + left) % 256
186
+ end
187
+ unfiltered += row_data
188
+ pos += line_len
189
+ end
190
+
191
+ unfiltered.pack("C*")
137
192
  end
138
193
  ################################################################################
139
194
  def png_depredict(data, opts = {})
@@ -41,23 +41,6 @@ class PDF::Reader
41
41
  extract_descendants(obj)
42
42
  end
43
43
 
44
- # returns a hash that maps glyph names to unicode codepoints. The mapping is based on
45
- # a text file supplied by Adobe at:
46
- # http://www.adobe.com/devnet/opentype/archives/glyphlist.txt
47
- def self.glyphnames
48
- glyphs = {}
49
-
50
- RUBY_VERSION >= "1.9" ? mode = "r:BINARY" : mode = "r"
51
- File.open(File.dirname(__FILE__) + "/glyphlist.txt",mode) do |f|
52
- f.each do |l|
53
- m, name, code = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
54
- glyphs[name.to_sym] = "0x#{code}".hex if name
55
- end
56
- end
57
-
58
- glyphs
59
- end
60
-
61
44
  def basefont=(font)
62
45
  # setup a default encoding for the selected font. It can always be overridden
63
46
  # with encoding= if required
@@ -0,0 +1,83 @@
1
+ # coding: utf-8
2
+
3
+ module PDF
4
+ class Reader
5
+
6
+ # High level representation of a single PDF form xobject. Form xobjects
7
+ # are contained pieces of content that can be inserted onto multiple
8
+ # pages. They're generally used as a space efficient way to store
9
+ # repetitive content (like logos, headers, footers, etc).
10
+ #
11
+ # This behaves and looks much like a limited PDF::Reader::Page class.
12
+ #
13
+ class FormXObject
14
+
15
+ def initialize(page, xobject)
16
+ @page = page
17
+ @objects = page.objects
18
+ @xobject = @objects.deref(xobject)
19
+ end
20
+
21
+ # Returns the resources that accompany this form.
22
+ #
23
+ def resources
24
+ @resources ||= @objects.deref(@xobject.hash[:Resources]) || {}
25
+ end
26
+
27
+ # return a hash of fonts used on this form.
28
+ #
29
+ # The keys are the font labels used within the form content stream.
30
+ #
31
+ # The values are PDF::Reader::Font instances that provide access
32
+ # to most available metrics for each font.
33
+ #
34
+ def fonts
35
+ raw_fonts = @objects.deref(resources[:Font] || {})
36
+ ::Hash[raw_fonts.map { |label, font|
37
+ [label, PDF::Reader::Font.new(@objects, @objects.deref(font))]
38
+ }]
39
+ end
40
+
41
+ # processes the raw content stream for this form in sequential order and
42
+ # passes callbacks to the receiver objects.
43
+ #
44
+ # See the comments on PDF::Reader::Page#walk for more detail.
45
+ #
46
+ def walk(*receivers)
47
+ content_stream(receivers, raw_content)
48
+ end
49
+
50
+ # returns the raw content stream for this form. This is plumbing, nothing to
51
+ # see here unless you're a PDF nerd like me.
52
+ #
53
+ def raw_content
54
+ @xobject.unfiltered_data
55
+ end
56
+
57
+ private
58
+
59
+ def callback(receivers, name, params=[])
60
+ receivers.each do |receiver|
61
+ receiver.send(name, *params) if receiver.respond_to?(name)
62
+ end
63
+ end
64
+
65
+ def content_stream(receivers, instructions)
66
+ buffer = Buffer.new(StringIO.new(instructions), :content_stream => true)
67
+ parser = Parser.new(buffer, @objects)
68
+ params = []
69
+
70
+ while (token = parser.parse_token(PagesStrategy::OPERATORS))
71
+ if token.kind_of?(Token) and PagesStrategy::OPERATORS.has_key?(token)
72
+ callback(receivers, PagesStrategy::OPERATORS[token], params)
73
+ params.clear
74
+ else
75
+ params << token
76
+ end
77
+ end
78
+ rescue EOFError => e
79
+ raise MalformedPDFError, "End Of File while processing a content stream"
80
+ end
81
+ end
82
+ end
83
+ end
@@ -0,0 +1,88 @@
1
+ ################################################################################
2
+ #
3
+ # Copyright (C) 2011 James Healy (jimmy@deefa.com)
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining
6
+ # a copy of this software and associated documentation files (the
7
+ # "Software"), to deal in the Software without restriction, including
8
+ # without limitation the rights to use, copy, modify, merge, publish,
9
+ # distribute, sublicense, and/or sell copies of the Software, and to
10
+ # permit persons to whom the Software is furnished to do so, subject to
11
+ # the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be
14
+ # included in all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
+ #
24
+ ################################################################################
25
+
26
+ class PDF::Reader
27
+ class GlyphHash # :nodoc:
28
+ def initialize
29
+ @adobe = load_adobe_glyph_mapping
30
+ end
31
+
32
+ # attempt to convert a PDF Name to a unicode codepoint. Returns nil
33
+ # if no conversion is possible.
34
+ #
35
+ # h = GlyphHash.new
36
+ #
37
+ # h[:A]
38
+ # => 65
39
+ #
40
+ # h[:Euro]
41
+ # => 8364
42
+ #
43
+ # h[:G30]
44
+ # => 48
45
+ #
46
+ # h[:34]
47
+ #
48
+ def [](name)
49
+ return nil unless name.is_a?(Symbol)
50
+
51
+ str = name.to_s
52
+
53
+ if @adobe.has_key?(name)
54
+ @adobe[name]
55
+ elsif str.match(/\Auni[A-F\d]{4}\Z/)
56
+ "0x#{str[3,4]}".hex
57
+ elsif str.match(/\Au[A-F\d]{4,6}\Z/)
58
+ "0x#{str[1,6]}".hex
59
+ elsif str.match(/\A[A-Za-z]\d{2,4}\Z/)
60
+ str[1,4].to_i
61
+ elsif str.match(/\A[A-Za-z]{2}\d{2,4}\Z/)
62
+ str[2,4].to_i
63
+ else
64
+ nil
65
+ end
66
+ end
67
+
68
+ private
69
+
70
+ # returns a hash that maps glyph names to unicode codepoints. The mapping is based on
71
+ # a text file supplied by Adobe at:
72
+ # http://www.adobe.com/devnet/opentype/archives/glyphlist.txt
73
+ def load_adobe_glyph_mapping
74
+ glyphs = {}
75
+
76
+ RUBY_VERSION >= "1.9" ? mode = "r:BINARY" : mode = "r"
77
+ File.open(File.dirname(__FILE__) + "/glyphlist.txt", mode) do |f|
78
+ f.each do |l|
79
+ m, name, code = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
80
+ glyphs[name.to_sym] = "0x#{code}".hex if name
81
+ end
82
+ end
83
+
84
+ glyphs
85
+ end
86
+
87
+ end
88
+ end