pdf-reader 0.11.0.alpha → 0.12.0.alpha

Sign up to get free protection for your applications and to get access to all the features.
data/examples/text.rb CHANGED
@@ -6,35 +6,10 @@
6
6
  require 'rubygems'
7
7
  require 'pdf/reader'
8
8
 
9
- class PageTextReceiver
10
- attr_accessor :content
9
+ filename = File.expand_path(File.dirname(__FILE__)) + "/../spec/data/cairo-unicode.pdf"
11
10
 
12
- def initialize
13
- @content = []
14
- end
15
-
16
- # Called when page parsing starts
17
- def begin_page(arg = nil)
18
- @content << ""
19
- end
20
-
21
- # record text that is drawn on the page
22
- def show_text(string, *params)
23
- @content.last << string.strip
24
- end
25
-
26
- # there's a few text callbacks, so make sure we process them all
27
- alias :super_show_text :show_text
28
- alias :move_to_next_line_and_show_text :show_text
29
- alias :set_spacing_next_line_show_text :show_text
30
-
31
- # this final text callback takes slightly different arguments
32
- def show_text_with_positioning(*params)
33
- params = params.first
34
- params.each { |str| show_text(str) if str.kind_of?(String)}
11
+ PDF::Reader.open(filename) do |reader|
12
+ reader.pages.each do |page|
13
+ puts page.text
35
14
  end
36
15
  end
37
-
38
- receiver = PageTextReceiver.new
39
- pdf = PDF::Reader.file("somefile.pdf", receiver)
40
- puts receiver.content.inspect
data/examples/version.rb CHANGED
@@ -6,20 +6,8 @@
6
6
  require 'rubygems'
7
7
  require 'pdf/reader'
8
8
 
9
- class VersionReceiver
10
- attr_accessor :version
11
-
12
- def initialize
13
- @version = nil
14
- end
15
-
16
- # Called when document parsing starts
17
- def pdf_version(arg = nil)
18
- @version = arg
19
- end
9
+ filename = File.expand_path(File.dirname(__FILE__)) + "/../spec/data/cairo-basic.pdf"
20
10
 
11
+ PDF::Reader.open(filename) do |reader|
12
+ puts reader.pdf_version
21
13
  end
22
-
23
- receiver = VersionReceiver.new
24
- pdf = PDF::Reader.file(ARGV.shift, receiver)
25
- puts receiver.version
data/lib/pdf/reader.rb CHANGED
@@ -84,13 +84,18 @@ module PDF
84
84
  # page = reader.page(1)
85
85
  # page.walk(receiver)
86
86
  #
87
+ # == Encrypted Files
88
+ #
89
+ # Depending on the algorithm it may be possible to parse an encrypted file.
90
+ # For standard PDF encryption you'll need the :password option
91
+ #
92
+ # reader = PDF::Reader.new("somefile.pdf", :password => "apples")
93
+ #
87
94
  class Reader
88
95
 
89
96
  # lowlevel hash-like access to all objects in the underlying PDF
90
97
  attr_reader :objects
91
98
 
92
- attr_reader :page_count, :pdf_version, :info, :metadata
93
-
94
99
  # creates a new document reader for the provided PDF.
95
100
  #
96
101
  # input can be an IO-ish object (StringIO, File, etc) containing a PDF
@@ -102,16 +107,34 @@ module PDF
102
107
  # reader = PDF::Reader.new(file)
103
108
  # end
104
109
  #
105
- def initialize(input = nil)
110
+ # If the source file is encrypted you can provide a password for decrypting
111
+ #
112
+ # reader = PDF::Reader.new("somefile.pdf", :password => "apples")
113
+ #
114
+ def initialize(input = nil, opts = {})
106
115
  if input # support the deprecated Reader API
107
- @objects = PDF::Reader::ObjectHash.new(input)
108
- @page_count = get_page_count
109
- @pdf_version = @objects.pdf_version
110
- @info = @objects.deref(@objects.trailer[:Info])
111
- @metadata = get_metadata
116
+ @objects = PDF::Reader::ObjectHash.new(input, opts)
112
117
  end
113
118
  end
114
119
 
120
+ def info
121
+ @objects.deref(@objects.trailer[:Info])
122
+ end
123
+
124
+ def metadata
125
+ stream = @objects.deref(root[:Metadata])
126
+ stream ? stream.unfiltered_data : nil
127
+ end
128
+
129
# Returns the :Count entry of the document's root :Pages dictionary.
#
# The value is looked up on first use and cached in @page_count, so the
# page tree is only dereferenced once per reader (as long as the stored
# count is truthy).
#
def page_count
  @page_count ||= @objects.deref(root[:Pages])[:Count]
end
133
+
134
+ def pdf_version
135
+ @objects.pdf_version
136
+ end
137
+
115
138
  # syntactic sugar for opening a PDF file. Accepts the same arguments
116
139
  # as new().
117
140
  #
@@ -119,8 +142,14 @@ module PDF
119
142
  # puts reader.pdf_version
120
143
  # end
121
144
  #
122
- def self.open(input, &block)
123
- yield PDF::Reader.new(input)
145
+ # or
146
+ #
147
+ # PDF::Reader.open("somefile.pdf", :password => "apples") do |reader|
148
+ # puts reader.pdf_version
149
+ # end
150
+ #
151
+ def self.open(input, opts = {}, &block)
152
+ yield PDF::Reader.new(input, opts)
124
153
  end
125
154
 
126
155
  # DEPRECATED: this method was deprecated in version 0.11.0 and will
@@ -185,7 +214,7 @@ module PDF
185
214
  # methods available on each page
186
215
  #
187
216
  def pages
188
- (1..@page_count).map { |num|
217
+ (1..self.page_count).map { |num|
189
218
  PDF::Reader::Page.new(@objects, num)
190
219
  }
191
220
  end
@@ -204,7 +233,7 @@ module PDF
204
233
  #
205
234
  def page(num)
206
235
  num = num.to_i
207
- raise ArgumentError, "valid pages are 1 .. #{@page_count}" if num < 1 || num > @page_count
236
+ raise ArgumentError, "valid pages are 1 .. #{self.page_count}" if num < 1 || num > self.page_count
208
237
  PDF::Reader::Page.new(@objects, num)
209
238
  end
210
239
 
@@ -217,10 +246,6 @@ module PDF
217
246
  def parse(io, receivers, opts = {})
218
247
  ohash = ObjectHash.new(io)
219
248
 
220
- if ohash.trailer[:Encrypt]
221
- raise ::PDF::Reader::UnsupportedFeatureError, 'PDF::Reader cannot read encrypted PDF files'
222
- end
223
-
224
249
  options = {:pages => true, :raw_text => false, :metadata => true}
225
250
  options.merge!(opts)
226
251
 
@@ -252,17 +277,7 @@ module PDF
252
277
  end
253
278
 
254
279
  def root
255
- root ||= @objects.deref(@objects.trailer[:Root])
256
- end
257
-
258
- def get_metadata
259
- stream = @objects.deref(root[:Metadata])
260
- stream ? stream.unfiltered_data : nil
261
- end
262
-
263
- def get_page_count
264
- pages = @objects.deref(root[:Pages])
265
- pages[:Count]
280
+ @root ||= @objects.deref(@objects.trailer[:Root])
266
281
  end
267
282
 
268
283
  end
@@ -276,6 +291,8 @@ require 'pdf/reader/encoding'
276
291
  require 'pdf/reader/error'
277
292
  require 'pdf/reader/filter'
278
293
  require 'pdf/reader/font'
294
+ require 'pdf/reader/form_xobject'
295
+ require 'pdf/reader/glyph_hash'
279
296
  require 'pdf/reader/lzw'
280
297
  require 'pdf/reader/metadata_strategy'
281
298
  require 'pdf/reader/object_cache'
@@ -286,6 +303,7 @@ require 'pdf/reader/parser'
286
303
  require 'pdf/reader/print_receiver'
287
304
  require 'pdf/reader/reference'
288
305
  require 'pdf/reader/register_receiver'
306
+ require 'pdf/reader/standard_security_handler'
289
307
  require 'pdf/reader/stream'
290
308
  require 'pdf/reader/text_receiver'
291
309
  require 'pdf/reader/page_text_receiver'
@@ -97,7 +97,7 @@ class PDF::Reader
97
97
  }.map { |num|
98
98
  original_codepoint_to_unicode(num, tounicode)
99
99
  }.map { |c|
100
- glyphnames[c] || c
100
+ names_to_unicode[c] || c
101
101
  }.map { |c|
102
102
  if c.nil? || !c.is_a?(Fixnum)
103
103
  PDF::Reader::Encoding::UNKNOWN_CHAR
@@ -170,8 +170,8 @@ class PDF::Reader
170
170
  mapping.size > 0
171
171
  end
172
172
 
173
- def glyphnames
174
- @glyphnames ||= PDF::Reader::Font.glyphnames
173
+ def names_to_unicode
174
+ @names_to_unicode ||= PDF::Reader::GlyphHash.new
175
175
  end
176
176
 
177
177
  def load_mapping(file)
@@ -49,5 +49,6 @@ class PDF::Reader
49
49
  class MalformedPDFError < RuntimeError; end
50
50
  class InvalidObjectError < MalformedPDFError; end
51
51
  class UnsupportedFeatureError < RuntimeError; end
52
+ class EncryptedPDFError < UnsupportedFeatureError; end
52
53
  end
53
54
  ################################################################################
@@ -31,6 +31,7 @@ class PDF::Reader
31
31
  # content.
32
32
  #
33
33
  class Filter # :nodoc:
34
+
34
35
  ################################################################################
35
36
  # creates a new filter for decoding content.
36
37
  #
@@ -41,14 +42,16 @@ class PDF::Reader
41
42
  @options = options
42
43
 
43
44
  case name.to_sym
44
- when :ASCII85Decode then @filter = :ascii85
45
- when :ASCIIHexDecode then @filter = :asciihex
46
- when :CCITTFaxDecode then @filter = nil
47
- when :DCTDecode then @filter = nil
48
- when :FlateDecode then @filter = :flate
49
- when :JBIG2Decode then @filter = nil
50
- when :LZWDecode then @filter = :lzw
51
- else raise UnsupportedFeatureError, "Unknown filter: #{name}"
45
+ when :ASCII85Decode then @filter = :ascii85
46
+ when :ASCIIHexDecode then @filter = :asciihex
47
+ when :CCITTFaxDecode then @filter = nil
48
+ when :DCTDecode then @filter = nil
49
+ when :FlateDecode then @filter = :flate
50
+ when :JBIG2Decode then @filter = nil
51
+ when :LZWDecode then @filter = :lzw
52
+ when :RunLengthDecode then @filter = :runlength
53
+ else
54
+ raise UnsupportedFeatureError, "Unknown filter: #{name}"
52
55
  end
53
56
  end
54
57
  ################################################################################
@@ -117,6 +120,36 @@ class PDF::Reader
117
120
  depredict(data, @options)
118
121
  end
119
122
  ################################################################################
123
# Decode the specified data with the RunLengthDecode compression algorithm.
#
# The input is a sequence of runs, each introduced by a length byte:
#
# * 0..127   - copy the following (length + 1) bytes literally
# * 129..255 - repeat the following single byte (257 - length) times;
#              i.e. "\xFA\x00" ([250, 0]) expands to seven "\x00" bytes
# * 128      - end of data, anything after it is ignored
#
# The input is always walked byte by byte (via unpack), regardless of the
# encoding the string happens to be tagged with, so offsets cannot drift
# when +data+ contains multi-byte characters. A truncated final run yields
# whatever bytes are actually available.
#
# Returns the decoded bytes as a String built with pack("C*").
#
def runlength(data)
  bytes = data.unpack("C*")
  out   = []
  pos   = 0

  while pos < bytes.size
    length = bytes[pos]
    pos += 1

    break if length == 128

    if length < 128
      # literal run: the next length+1 bytes are copied as-is
      out.concat(bytes[pos, length + 1])
      pos += length + 1
    else
      # repeated run: guard against a length byte at the very end of input
      out.concat([bytes[pos]] * (257 - length)) if pos < bytes.size
      pos += 1
    end
  end

  out.pack("C*")
end
152
+ ################################################################################
120
153
  def depredict(data, opts = {})
121
154
  predictor = (opts || {})[:Predictor].to_i
122
155
 
@@ -133,7 +166,29 @@ class PDF::Reader
133
166
  end
134
167
  ################################################################################
135
168
  def tiff_depredict(data, opts = {})
136
- raise UnsupportedFeatureError, "TIFF predictor not supported"
169
+ data = data.unpack("C*")
170
+ unfiltered = []
171
+ bpc = opts[:BitsPerComponent] || 8
172
+ pixel_bits = bpc * opts[:Colors]
173
+ pixel_bytes = pixel_bits / 8
174
+ line_len = (pixel_bytes * opts[:Columns])
175
+ pos = 0
176
+
177
+ if bpc != 8
178
+ raise UnsupportedFeatureError, "TIFF predictor onlys supports 8 Bits Per Component"
179
+ end
180
+
181
+ until pos > data.size
182
+ row_data = data[pos, line_len]
183
+ row_data.each_with_index do |byte, index|
184
+ left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
185
+ row_data[index] = (byte + left) % 256
186
+ end
187
+ unfiltered += row_data
188
+ pos += line_len
189
+ end
190
+
191
+ unfiltered.pack("C*")
137
192
  end
138
193
  ################################################################################
139
194
  def png_depredict(data, opts = {})
@@ -41,23 +41,6 @@ class PDF::Reader
41
41
  extract_descendants(obj)
42
42
  end
43
43
 
44
- # returns a hash that maps glyph names to unicode codepoints. The mapping is based on
45
- # a text file supplied by Adobe at:
46
- # http://www.adobe.com/devnet/opentype/archives/glyphlist.txt
47
- def self.glyphnames
48
- glyphs = {}
49
-
50
- RUBY_VERSION >= "1.9" ? mode = "r:BINARY" : mode = "r"
51
- File.open(File.dirname(__FILE__) + "/glyphlist.txt",mode) do |f|
52
- f.each do |l|
53
- m, name, code = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
54
- glyphs[name.to_sym] = "0x#{code}".hex if name
55
- end
56
- end
57
-
58
- glyphs
59
- end
60
-
61
44
  def basefont=(font)
62
45
  # setup a default encoding for the selected font. It can always be overridden
63
46
  # with encoding= if required
@@ -0,0 +1,83 @@
1
+ # coding: utf-8
2
+
3
module PDF
  class Reader

    # High level representation of a single PDF form xobject. Form xobjects
    # are contained pieces of content that can be inserted onto multiple
    # pages. They're generally used as a space efficient way to store
    # repetitive content (like logos, headers, footers, etc).
    #
    # This behaves and looks much like a limited PDF::Reader::Page class.
    #
    class FormXObject

      # page    - the page this form belongs to. Only page.objects is used
      #           here, to reach the object store.
      # xobject - the form itself, or a reference to it; it is dereferenced
      #           through the object store. The result must respond to
      #           #hash (a dictionary) and #unfiltered_data - presumably a
      #           PDF::Reader::Stream, confirm against callers.
      #
      def initialize(page, xobject)
        @page    = page
        @objects = page.objects
        @xobject = @objects.deref(xobject)
      end

      # Returns the resources that accompany this form, or an empty hash if
      # the form has no :Resources entry. The result is memoized.
      #
      def resources
        @resources ||= @objects.deref(@xobject.hash[:Resources]) || {}
      end

      # return a hash of fonts used on this form.
      #
      # The keys are the font labels used within the form content stream.
      #
      # The values are PDF::Reader::Font instances that provide access
      # to most available metrics for each font.
      #
      # An empty hash is returned when the form has no :Font resources or
      # the font dictionary can't be dereferenced.
      #
      def fonts
        raw_fonts = @objects.deref(resources[:Font] || {}) || {}
        ::Hash[raw_fonts.map { |label, font|
          [label, PDF::Reader::Font.new(@objects, @objects.deref(font))]
        }]
      end

      # processes the raw content stream for this form in sequential order and
      # passes callbacks to the receiver objects.
      #
      # See the comments on PDF::Reader::Page#walk for more detail.
      #
      def walk(*receivers)
        content_stream(receivers, raw_content)
      end

      # returns the raw content stream for this form. This is plumbing, nothing to
      # see here unless you're a PDF nerd like me.
      #
      def raw_content
        @xobject.unfiltered_data
      end

      private

      # Invokes +name+ with +params+ on every receiver that responds to it;
      # receivers that don't implement the callback are silently skipped.
      #
      def callback(receivers, name, params=[])
        receivers.each do |receiver|
          receiver.send(name, *params) if receiver.respond_to?(name)
        end
      end

      # Tokenises +instructions+ and dispatches one callback per operator.
      #
      # Tokens are accumulated as operands until an operator listed in
      # PagesStrategy::OPERATORS is seen; the mapped callback is then fired
      # with the collected operands and the operand list is reset.
      #
      # Raises MalformedPDFError if the stream ends unexpectedly.
      #
      def content_stream(receivers, instructions)
        buffer = Buffer.new(StringIO.new(instructions), :content_stream => true)
        parser = Parser.new(buffer, @objects)
        params = []

        while (token = parser.parse_token(PagesStrategy::OPERATORS))
          if token.kind_of?(Token) && PagesStrategy::OPERATORS.has_key?(token)
            callback(receivers, PagesStrategy::OPERATORS[token], params)
            params.clear
          else
            params << token
          end
        end
      rescue EOFError
        raise MalformedPDFError, "End Of File while processing a content stream"
      end
    end
  end
end
@@ -0,0 +1,88 @@
1
+ ################################################################################
2
+ #
3
+ # Copyright (C) 2011 James Healy (jimmy@deefa.com)
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining
6
+ # a copy of this software and associated documentation files (the
7
+ # "Software"), to deal in the Software without restriction, including
8
+ # without limitation the rights to use, copy, modify, merge, publish,
9
+ # distribute, sublicense, and/or sell copies of the Software, and to
10
+ # permit persons to whom the Software is furnished to do so, subject to
11
+ # the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be
14
+ # included in all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
+ #
24
+ ################################################################################
25
+
26
module PDF
  class Reader
    # Converts PDF glyph names into unicode codepoints, using the Adobe
    # glyph list plus a handful of common naming patterns.
    class GlyphHash # :nodoc:

      # Loads the Adobe glyph list from glyphlist.txt, which must sit in the
      # same directory as this file.
      def initialize
        @adobe = load_adobe_glyph_mapping
      end

      # attempt to convert a PDF Name to a unicode codepoint. Returns nil
      # if no conversion is possible.
      #
      # Names found in the Adobe glyph list win. Otherwise the following
      # patterns are recognised, in this order:
      #
      # * uniXXXX       - four uppercase hex digits
      # * uXXXX[XX]     - four to six uppercase hex digits
      # * one letter followed by 2-4 decimal digits
      # * two letters followed by 2-4 decimal digits
      #
      #   h = GlyphHash.new
      #
      #   h[:A]
      #   => 65
      #
      #   h[:Euro]
      #   => 8364
      #
      #   h[:uni0041]
      #   => 65
      #
      #   h[:G30]
      #   => 30
      #
      #   h[:"34"]
      #   => nil
      #
      def [](name)
        return nil unless name.is_a?(Symbol)
        return @adobe[name] if @adobe.has_key?(name)

        # \z (not \Z) so a name with a trailing newline is not accepted
        case name.to_s
        when /\Auni([A-F\d]{4})\z/      then $1.hex
        when /\Au([A-F\d]{4,6})\z/      then $1.hex
        when /\A[A-Za-z](\d{2,4})\z/    then $1.to_i
        when /\A[A-Za-z]{2}(\d{2,4})\z/ then $1.to_i
        end
      end

      private

      # returns a hash that maps glyph names to unicode codepoints. The mapping is based on
      # a text file supplied by Adobe at:
      # http://www.adobe.com/devnet/opentype/archives/glyphlist.txt
      #
      # Lines that don't contain a "name;XXXX" pair are ignored.
      def load_adobe_glyph_mapping
        glyphs = {}
        mode   = RUBY_VERSION >= "1.9" ? "r:BINARY" : "r"

        File.open(File.join(File.dirname(__FILE__), "glyphlist.txt"), mode) do |f|
          f.each_line do |line|
            next unless line =~ /([0-9A-Za-z]+);([0-9A-F]{4})/
            glyphs[$1.to_sym] = $2.hex
          end
        end

        glyphs
      end

    end
  end
end