pdf-reader 0.8.6 → 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -9,10 +9,10 @@
9
9
  # distribute, sublicense, and/or sell copies of the Software, and to
10
10
  # permit persons to whom the Software is furnished to do so, subject to
11
11
  # the following conditions:
12
- #
12
+ #
13
13
  # The above copyright notice and this permission notice shall be
14
14
  # included in all copies or substantial portions of the Software.
15
- #
15
+ #
16
16
  # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
17
  # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
18
  # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
@@ -24,6 +24,8 @@
24
24
  ################################################################################
25
25
 
26
26
  require 'stringio'
27
+ require 'zlib'
28
+
27
29
  require 'ascii85'
28
30
 
29
31
  module PDF
@@ -37,77 +39,127 @@ module PDF
37
39
  # on receivers.
38
40
  #
39
41
  # = Parsing a file
40
- #
42
+ #
41
43
  # PDF::Reader.file("somefile.pdf", receiver)
42
44
  #
43
45
  # = Parsing a String
44
- #
46
+ #
45
47
  # This is useful for processing a PDF that is already in memory
46
48
  #
47
49
  # PDF::Reader.string(pdf_string, receiver)
48
50
  #
49
51
  # = Parsing an IO object
50
- #
52
+ #
51
53
  # This can be a useful alternative to the first 2 options in some situations
52
54
  #
53
55
  # pdf = PDF::Reader.new
54
56
  # pdf.parse(File.new("somefile.pdf"), receiver)
55
57
  #
56
58
  # = Parsing parts of a file
57
- #
58
- # Both PDF::Reader#file and PDF::Reader#string accept a 3 argument that specifies which
59
- # parts of the file to process. By default, all options are enabled, so this can be useful
60
- # to cut down processing time if you're only interested in say, metadata.
61
59
  #
62
- # As an example, the following call will disable parsing the contents of pages in the file,
63
- # but explicitly enables processing metadata.
60
+ # Both PDF::Reader#file and PDF::Reader#string accept a third argument that
61
+ # specifies which parts of the file to process. By default, all options are
62
+ # enabled, so this can be useful to cut down processing time if you're only
63
+ # interested in, say, metadata.
64
+ #
65
+ # As an example, the following call will disable parsing the contents of
66
+ # pages in the file, but explicitly enables processing metadata.
64
67
  #
65
68
  # PDF::Reader.file("somefile.pdf", receiver, {:metadata => true, :pages => false})
66
69
  #
67
70
  # Available options are currently:
68
- #
71
+ #
69
72
  # :metadata
70
73
  # :pages
74
+ # :raw_text
75
+ #
71
76
  class Reader
72
- ################################################################################
77
+
73
78
  # Parse the file with the given name, sending events to the given receiver.
79
+ #
74
80
  def self.file(name, receiver, opts = {})
75
81
  File.open(name,"rb") do |f|
76
82
  new.parse(f, receiver, opts)
77
83
  end
78
84
  end
79
- ################################################################################
85
+
80
86
  # Parse the given string, sending events to the given receiver.
87
+ #
81
88
  def self.string(str, receiver, opts = {})
82
89
  StringIO.open(str) do |s|
83
90
  new.parse(s, receiver, opts)
84
91
  end
85
92
  end
86
- ################################################################################
87
- def self.object_file(name, id, gen)
88
- File.open(name,"rb") do |f|
89
- new.object(f, id, gen)
90
- end
93
+
94
+ # Parse the file with the given name, returning an unmarshalled ruby object that
95
+ # represents the requested pdf object
96
+ #
97
+ def self.object_file(name, id, gen = 0)
98
+ File.open(name,"rb") { |f|
99
+ new.object(f, id.to_i, gen.to_i)
100
+ }
91
101
  end
92
- ################################################################################
93
- def self.object_string(name, id, gen)
94
- StringIO.open(str) do |s|
95
- new.object(s, id, gen)
102
+
103
+ # Parse the given string, returning an unmarshalled ruby object that represents
104
+ # the requested pdf object
105
+ #
106
+ def self.object_string(str, id, gen = 0)
107
+ StringIO.open(str) { |s|
108
+ new.object(s, id.to_i, gen.to_i)
109
+ }
110
+ end
111
+
112
+ # Given an IO object that contains PDF data, parse it.
113
+ #
114
+ def parse(io, receiver, opts = {})
115
+ ohash = ObjectHash.new(io)
116
+
117
+ if ohash.trailer[:Encrypt]
118
+ raise PDF::Reader::UnsupportedFeatureError, 'PDF::Reader cannot read encrypted PDF files'
119
+ end
120
+
121
+ options = {:pages => true, :raw_text => false, :metadata => true}
122
+ options.merge!(opts)
123
+
124
+ strategies.each do |s|
125
+ s.new(ohash, receiver, options).process
96
126
  end
127
+
128
+ self
129
+ end
130
+
131
+ # Given an IO object that contains PDF data, return the contents of a single object
132
+ #
133
+ def object (io, id, gen)
134
+ @ohash = ObjectHash.new(io)
135
+
136
+ @ohash.object(Reference.new(id, gen))
137
+ end
138
+
139
+ private
140
+
141
+ def strategies
142
+ @strategies ||= [
143
+ PDF::Reader::MetadataStrategy,
144
+ PDF::Reader::PagesStrategy
145
+ ]
97
146
  end
98
- ################################################################################
99
147
  end
100
- ################################################################################
101
148
  end
102
149
  ################################################################################
103
- require 'pdf/reader/explore'
150
+
151
+ require 'pdf/reader/abstract_strategy'
104
152
  require 'pdf/reader/buffer'
105
153
  require 'pdf/reader/cmap'
106
- require 'pdf/reader/content'
107
154
  require 'pdf/reader/encoding'
108
155
  require 'pdf/reader/error'
109
156
  require 'pdf/reader/filter'
110
157
  require 'pdf/reader/font'
158
+ require 'pdf/reader/lzw'
159
+ require 'pdf/reader/metadata_strategy'
160
+ require 'pdf/reader/object_hash'
161
+ require 'pdf/reader/object_stream'
162
+ require 'pdf/reader/pages_strategy'
111
163
  require 'pdf/reader/parser'
112
164
  require 'pdf/reader/print_receiver'
113
165
  require 'pdf/reader/reference'
@@ -117,31 +169,3 @@ require 'pdf/reader/text_receiver'
117
169
  require 'pdf/reader/token'
118
170
  require 'pdf/reader/xref'
119
171
  require 'pdf/hash'
120
-
121
- class PDF::Reader
122
- ################################################################################
123
- # Given an IO object that contains PDF data, parse it.
124
- def parse (io, receiver, opts = {})
125
- @xref = XRef.new(io)
126
- @content = (receiver == Explore ? Explore : Content).new(receiver, @xref)
127
-
128
- options = {:pages => true, :metadata => true}
129
- options.merge!(opts)
130
-
131
- trailer = @xref.load
132
- raise PDF::Reader::UnsupportedFeatureError, 'PDF::Reader cannot read encrypted PDF files' if trailer[:Encrypt]
133
- @content.metadata(@xref.object(trailer[:Root]), @xref.object(trailer[:Info])) if options[:metadata]
134
- @content.document(@xref.object(trailer[:Root])) if options[:pages]
135
- self
136
- end
137
- ################################################################################
138
- # Given an IO object that contains PDF data, return the contents of a single object
139
- def object (io, id, gen)
140
- @xref = XRef.new(io)
141
- @xref.load
142
-
143
- @xref.object(Reference.new(id, gen))
144
- end
145
- ################################################################################
146
- end
147
- ################################################################################
@@ -0,0 +1,77 @@
1
+ # coding: utf-8
2
+
3
+ class PDF::Reader
4
+
5
+ class AbstractStrategy # :nodoc:
6
+
7
+ def initialize(ohash, receiver, options = {})
8
+ @ohash, @receiver, @options = ohash, receiver, options
9
+ end
10
+
11
+ private
12
+
13
+ def options
14
+ @options || {}
15
+ end
16
+
17
+ # calls the name callback method on the receiver class with params as the arguments
18
+ #
19
+ def callback (name, params=[])
20
+ receiver.send(name, *params) if receiver.respond_to?(name)
21
+ end
22
+
23
+ # strings outside of page content should be in either PDFDocEncoding or UTF-16.
24
+ def decode_strings(obj)
25
+ case obj
26
+ when String then
27
+ if obj[0,2].unpack("C*").slice(0,2) == [254,255]
28
+ PDF::Reader::Encoding.new(:UTF16Encoding).to_utf8(obj[2, obj.size])
29
+ else
30
+ PDF::Reader::Encoding.new(:PDFDocEncoding).to_utf8(obj)
31
+ end
32
+ when Hash then obj.each { |key,val| obj[key] = decode_strings(val) }
33
+ when Array then obj.collect { |item| decode_strings(item) }
34
+ else
35
+ obj
36
+ end
37
+ end
38
+
39
+ def info
40
+ ohash.object(trailer[:Info])
41
+ end
42
+
43
+ def info?
44
+ info ? true : false
45
+ end
46
+
47
+ def ohash
48
+ @ohash
49
+ end
50
+
51
+ def pages
52
+ ohash.object(root[:Pages])
53
+ end
54
+
55
+ def pages?
56
+ pages ? true : false
57
+ end
58
+
59
+ def receiver
60
+ @receiver
61
+ end
62
+
63
+ def root
64
+ ohash.object(trailer[:Root])
65
+ end
66
+
67
+ def root?
68
+ root ? true : false
69
+ end
70
+
71
+ def trailer
72
+ ohash.trailer
73
+ end
74
+
75
+ end
76
+ end
77
+
@@ -48,11 +48,13 @@ class PDF::Reader
48
48
  # options:
49
49
  #
50
50
  # :seek - a byte offset to seek to before starting to tokenise
51
+ # :content_stream - set to true if buffer will be tokenising a
52
+ # content stream. Defaults to false
51
53
  #
52
54
  def initialize (io, opts = {})
53
55
  @io = io
54
56
  @tokens = []
55
- @options = opts
57
+ @in_content_stream = opts[:content_stream]
56
58
 
57
59
  @io.seek(opts[:seek]) if opts[:seek]
58
60
  @pos = @io.pos
@@ -98,30 +100,6 @@ class PDF::Reader
98
100
  bytes
99
101
  end
100
102
 
101
- # return raw bytes from the underlying IO stream. All bytes up to the first
102
- # occurrence of needle will be returned. The match (if any) is not returned.
103
- # The IO stream cursor is left on the first byte of the match.
104
- #
105
- # needle - a string to search the IO stream for
106
- #
107
- def read_until(needle)
108
- reset_pos
109
- out = ""
110
- size = needle.size
111
-
112
- while out[size * -1, size] != needle && !@io.eof?
113
- out << @io.read(1)
114
- end
115
-
116
- if out[size * -1, size] == needle
117
- out = out[0, out.size - size]
118
- @io.seek(size * -1, IO::SEEK_CUR)
119
- end
120
-
121
- save_pos
122
- out
123
- end
124
-
125
103
  # return the next token from the source. Returns a string if a token
126
104
  # is found, nil if there are no tokens left.
127
105
  #
@@ -141,19 +119,8 @@ class PDF::Reader
141
119
  data = @io.read(1024)
142
120
 
143
121
  # the PDF 1.7 spec (section #3.4) says that EOL markers can be either \r, \n, or both.
144
- # To ensure we find the xref offset correctly, change all possible options to a
145
- # standard format
146
- data = data.gsub("\r\n","\n").gsub("\n\r","\n").gsub("\r","\n")
147
- lines = data.split(/\n/).reverse
148
-
149
- eof_index = nil
150
-
151
- lines.each_with_index do |line, index|
152
- if line =~ /^%%EOF\r?$/
153
- eof_index = index
154
- break
155
- end
156
- end
122
+ lines = data.split(/[\n\r]+/).reverse
123
+ eof_index = lines.index { |l| l.strip == "%%EOF" }
157
124
 
158
125
  raise MalformedPDFError, "PDF does not contain EOF marker" if eof_index.nil?
159
126
  raise MalformedPDFError, "PDF EOF marker does not follow offset" if eof_index >= lines.size-1
@@ -162,6 +129,12 @@ class PDF::Reader
162
129
 
163
130
  private
164
131
 
132
+ # Returns true if this buffer is parsing a content stream
133
+ #
134
+ def in_content_stream?
135
+ @in_content_stream ? true : false
136
+ end
137
+
165
138
  # Some bastard moved our IO stream cursor. Restore it.
166
139
  #
167
140
  def reset_pos
@@ -181,8 +154,12 @@ class PDF::Reader
181
154
  10.times do
182
155
  if state == :literal_string
183
156
  prepare_literal_token
157
+ elsif state == :hex_string
158
+ prepare_hex_token
184
159
  elsif state == :regular
185
160
  prepare_regular_token
161
+ elsif state == :inline
162
+ prepare_inline_token
186
163
  end
187
164
  end
188
165
 
@@ -195,8 +172,12 @@ class PDF::Reader
195
172
  def state
196
173
  if @tokens[-1] == "("
197
174
  :literal_string
175
+ elsif @tokens[-1] == "<"
176
+ :hex_string
198
177
  elsif @tokens[-1] == "stream"
199
178
  :stream
179
+ elsif in_content_stream? && @tokens[-1] == "ID"
180
+ :inline
200
181
  else
201
182
  :regular
202
183
  end
@@ -226,6 +207,44 @@ class PDF::Reader
226
207
  end
227
208
  end
228
209
 
210
+ def prepare_inline_token
211
+ str = ""
212
+
213
+ while str[-2,2] != "EI"
214
+ chr = @io.read(1)
215
+ break if chr.nil?
216
+ str << chr
217
+ end
218
+
219
+ @tokens << str[0, str.size-2].strip
220
+ @io.seek(-2, IO::SEEK_CUR) unless chr.nil?
221
+ end
222
+
223
+ # if we're currently inside a hex string, read hex nibbles until
224
+ # we find a closing >
225
+ #
226
+ def prepare_hex_token
227
+ str = ""
228
+ finished = false
229
+
230
+ while !finished
231
+ chr = @io.read(1)
232
+ codepoint = chr.to_s.unpack("C*").first
233
+ if chr.nil?
234
+ finished = true # unbalanced params
235
+ elsif (48..57).include?(codepoint) || (65..90).include?(codepoint) || (97..122).include?(codepoint)
236
+ str << chr
237
+ elsif codepoint <= 32
238
+ # ignore it
239
+ else
240
+ @tokens << str if str.size > 0
241
+ @tokens << ">" if chr != ">"
242
+ @tokens << chr
243
+ finished = true
244
+ end
245
+ end
246
+ end
247
+
229
248
  # if we're currently inside a literal string we more or less just read bytes until
230
249
  # we find the closing ) delimiter. Lots of bytes that would otherwise indicate the
231
250
  # start of a new token in regular mode are left untouched when inside a literal
@@ -243,10 +262,12 @@ class PDF::Reader
243
262
  chr = @io.read(1)
244
263
  if chr.nil?
245
264
  count = 0 # unbalanced params
246
- elsif chr == "(" && str[-1,1] != "\x5C"
265
+ elsif chr == "\x5c"
266
+ str << chr << @io.read(1).to_s
267
+ elsif chr == "("
247
268
  str << "("
248
269
  count += 1
249
- elsif chr == ")" && str[-1,1] != "\x5C"
270
+ elsif chr == ")"
250
271
  count -= 1
251
272
  str << ")" unless count == 0
252
273
  else
@@ -24,30 +24,31 @@
24
24
  ################################################################################
25
25
 
26
26
  class PDF::Reader
27
- class CMap
27
+ class CMap # :nodoc:
28
28
 
29
29
  def initialize(data)
30
30
  @map = {}
31
- in_char_mode = false
32
- in_range_mode = false
31
+ process_data(data)
32
+ end
33
+
34
+ def process_data(data)
35
+ mode = nil
33
36
  instructions = ""
34
37
 
35
38
  data.each_line do |l|
36
39
  if l.include?("beginbfchar")
37
- in_char_mode = true
40
+ mode = :char
38
41
  elsif l.include?("endbfchar")
39
42
  process_bfchar_instructions(instructions)
40
43
  instructions = ""
41
- in_char_mode = false
44
+ mode = nil
42
45
  elsif l.include?("beginbfrange")
43
- in_range_mode = true
46
+ mode = :range
44
47
  elsif l.include?("endbfrange")
45
48
  process_bfrange_instructions(instructions)
46
49
  instructions = ""
47
- in_range_mode = false
48
- end
49
-
50
- if !l.include?("begin") && (in_char_mode || in_range_mode)
50
+ mode = nil
51
+ elsif mode == :char || mode == :range
51
52
  instructions << l
52
53
  end
53
54
  end