pdf-reader 0.8.6 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,10 +9,10 @@
9
9
  # distribute, sublicense, and/or sell copies of the Software, and to
10
10
  # permit persons to whom the Software is furnished to do so, subject to
11
11
  # the following conditions:
12
- #
12
+ #
13
13
  # The above copyright notice and this permission notice shall be
14
14
  # included in all copies or substantial portions of the Software.
15
- #
15
+ #
16
16
  # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
17
  # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
18
  # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
@@ -24,6 +24,8 @@
24
24
  ################################################################################
25
25
 
26
26
  require 'stringio'
27
+ require 'zlib'
28
+
27
29
  require 'ascii85'
28
30
 
29
31
  module PDF
@@ -37,77 +39,127 @@ module PDF
37
39
  # on receivers.
38
40
  #
39
41
  # = Parsing a file
40
- #
42
+ #
41
43
  # PDF::Reader.file("somefile.pdf", receiver)
42
44
  #
43
45
  # = Parsing a String
44
- #
46
+ #
45
47
  # This is useful for processing a PDF that is already in memory
46
48
  #
47
49
  # PDF::Reader.string(pdf_string, receiver)
48
50
  #
49
51
  # = Parsing an IO object
50
- #
52
+ #
51
53
  # This can be a useful alternative to the first 2 options in some situations
52
54
  #
53
55
  # pdf = PDF::Reader.new
54
56
  # pdf.parse(File.new("somefile.pdf"), receiver)
55
57
  #
56
58
  # = Parsing parts of a file
57
- #
58
- # Both PDF::Reader#file and PDF::Reader#string accept a 3 argument that specifies which
59
- # parts of the file to process. By default, all options are enabled, so this can be useful
60
- # to cut down processing time if you're only interested in say, metadata.
61
59
  #
62
- # As an example, the following call will disable parsing the contents of pages in the file,
63
- # but explicitly enables processing metadata.
60
+ # Both PDF::Reader#file and PDF::Reader#string accept a third argument that
61
+ # specifies which parts of the file to process. By default, all options are
62
+ # enabled, so this can be useful to cut down processing time if you're only
63
+ # interested in, say, metadata.
64
+ #
65
+ # As an example, the following call will disable parsing the contents of
66
+ # pages in the file, but explicitly enables processing metadata.
64
67
  #
65
68
  # PDF::Reader.file("somefile.pdf", receiver, {:metadata => true, :pages => false})
66
69
  #
67
70
  # Available options are currently:
68
- #
71
+ #
69
72
  # :metadata
70
73
  # :pages
74
+ # :raw_text
75
+ #
71
76
  class Reader
72
- ################################################################################
77
+
73
78
  # Parse the file with the given name, sending events to the given receiver.
79
+ #
74
80
  def self.file(name, receiver, opts = {})
75
81
  File.open(name,"rb") do |f|
76
82
  new.parse(f, receiver, opts)
77
83
  end
78
84
  end
79
- ################################################################################
85
+
80
86
  # Parse the given string, sending events to the given receiver.
87
+ #
81
88
  def self.string(str, receiver, opts = {})
82
89
  StringIO.open(str) do |s|
83
90
  new.parse(s, receiver, opts)
84
91
  end
85
92
  end
86
- ################################################################################
87
- def self.object_file(name, id, gen)
88
- File.open(name,"rb") do |f|
89
- new.object(f, id, gen)
90
- end
93
+
94
+ # Parse the file with the given name, returning an unmarshalled ruby version of
95
+ # the requested pdf object
96
+ #
97
+ def self.object_file(name, id, gen = 0)
98
+ File.open(name,"rb") { |f|
99
+ new.object(f, id.to_i, gen.to_i)
100
+ }
91
101
  end
92
- ################################################################################
93
- def self.object_string(name, id, gen)
94
- StringIO.open(str) do |s|
95
- new.object(s, id, gen)
102
+
103
+ # Parse the given string, returning an unmarshalled ruby version of
104
+ # the requested pdf object
105
+ #
106
+ def self.object_string(str, id, gen = 0)
107
+ StringIO.open(str) { |s|
108
+ new.object(s, id.to_i, gen.to_i)
109
+ }
110
+ end
111
+
112
+ # Given an IO object that contains PDF data, parse it.
113
+ #
114
+ def parse(io, receiver, opts = {})
115
+ ohash = ObjectHash.new(io)
116
+
117
+ if ohash.trailer[:Encrypt]
118
+ raise PDF::Reader::UnsupportedFeatureError, 'PDF::Reader cannot read encrypted PDF files'
119
+ end
120
+
121
+ options = {:pages => true, :raw_text => false, :metadata => true}
122
+ options.merge!(opts)
123
+
124
+ strategies.each do |s|
125
+ s.new(ohash, receiver, options).process
96
126
  end
127
+
128
+ self
129
+ end
130
+
131
+ # Given an IO object that contains PDF data, return the contents of a single object
132
+ #
133
+ def object (io, id, gen)
134
+ @ohash = ObjectHash.new(io)
135
+
136
+ @ohash.object(Reference.new(id, gen))
137
+ end
138
+
139
+ private
140
+
141
+ def strategies
142
+ @strategies ||= [
143
+ PDF::Reader::MetadataStrategy,
144
+ PDF::Reader::PagesStrategy
145
+ ]
97
146
  end
98
- ################################################################################
99
147
  end
100
- ################################################################################
101
148
  end
102
149
  ################################################################################
103
- require 'pdf/reader/explore'
150
+
151
+ require 'pdf/reader/abstract_strategy'
104
152
  require 'pdf/reader/buffer'
105
153
  require 'pdf/reader/cmap'
106
- require 'pdf/reader/content'
107
154
  require 'pdf/reader/encoding'
108
155
  require 'pdf/reader/error'
109
156
  require 'pdf/reader/filter'
110
157
  require 'pdf/reader/font'
158
+ require 'pdf/reader/lzw'
159
+ require 'pdf/reader/metadata_strategy'
160
+ require 'pdf/reader/object_hash'
161
+ require 'pdf/reader/object_stream'
162
+ require 'pdf/reader/pages_strategy'
111
163
  require 'pdf/reader/parser'
112
164
  require 'pdf/reader/print_receiver'
113
165
  require 'pdf/reader/reference'
@@ -117,31 +169,3 @@ require 'pdf/reader/text_receiver'
117
169
  require 'pdf/reader/token'
118
170
  require 'pdf/reader/xref'
119
171
  require 'pdf/hash'
120
-
121
- class PDF::Reader
122
- ################################################################################
123
- # Given an IO object that contains PDF data, parse it.
124
- def parse (io, receiver, opts = {})
125
- @xref = XRef.new(io)
126
- @content = (receiver == Explore ? Explore : Content).new(receiver, @xref)
127
-
128
- options = {:pages => true, :metadata => true}
129
- options.merge!(opts)
130
-
131
- trailer = @xref.load
132
- raise PDF::Reader::UnsupportedFeatureError, 'PDF::Reader cannot read encrypted PDF files' if trailer[:Encrypt]
133
- @content.metadata(@xref.object(trailer[:Root]), @xref.object(trailer[:Info])) if options[:metadata]
134
- @content.document(@xref.object(trailer[:Root])) if options[:pages]
135
- self
136
- end
137
- ################################################################################
138
- # Given an IO object that contains PDF data, return the contents of a single object
139
- def object (io, id, gen)
140
- @xref = XRef.new(io)
141
- @xref.load
142
-
143
- @xref.object(Reference.new(id, gen))
144
- end
145
- ################################################################################
146
- end
147
- ################################################################################
@@ -0,0 +1,77 @@
1
+ # coding: utf-8
2
+
3
+ class PDF::Reader
4
+
5
+ class AbstractStrategy # :nodoc:
6
+
7
+ def initialize(ohash, receiver, options = {})
8
+ @ohash, @receiver, @options = ohash, receiver, options
9
+ end
10
+
11
+ private
12
+
13
+ def options
14
+ @options || {}
15
+ end
16
+
17
+ # calls the name callback method on the receiver class with params as the arguments
18
+ #
19
+ def callback (name, params=[])
20
+ receiver.send(name, *params) if receiver.respond_to?(name)
21
+ end
22
+
23
+ # strings outside of page content should be in either PDFDocEncoding or UTF-16.
24
+ def decode_strings(obj)
25
+ case obj
26
+ when String then
27
+ if obj[0,2].unpack("C*").slice(0,2) == [254,255]
28
+ PDF::Reader::Encoding.new(:UTF16Encoding).to_utf8(obj[2, obj.size])
29
+ else
30
+ PDF::Reader::Encoding.new(:PDFDocEncoding).to_utf8(obj)
31
+ end
32
+ when Hash then obj.each { |key,val| obj[key] = decode_strings(val) }
33
+ when Array then obj.collect { |item| decode_strings(item) }
34
+ else
35
+ obj
36
+ end
37
+ end
38
+
39
+ def info
40
+ ohash.object(trailer[:Info])
41
+ end
42
+
43
+ def info?
44
+ info ? true : false
45
+ end
46
+
47
+ def ohash
48
+ @ohash
49
+ end
50
+
51
+ def pages
52
+ ohash.object(root[:Pages])
53
+ end
54
+
55
+ def pages?
56
+ pages ? true : false
57
+ end
58
+
59
+ def receiver
60
+ @receiver
61
+ end
62
+
63
+ def root
64
+ ohash.object(trailer[:Root])
65
+ end
66
+
67
+ def root?
68
+ root ? true : false
69
+ end
70
+
71
+ def trailer
72
+ ohash.trailer
73
+ end
74
+
75
+ end
76
+ end
77
+
@@ -48,11 +48,13 @@ class PDF::Reader
48
48
  # options:
49
49
  #
50
50
  # :seek - a byte offset to seek to before starting to tokenise
51
+ # :content_stream - set to true if buffer will be tokenising a
52
+ # content stream. Defaults to false
51
53
  #
52
54
  def initialize (io, opts = {})
53
55
  @io = io
54
56
  @tokens = []
55
- @options = opts
57
+ @in_content_stream = opts[:content_stream]
56
58
 
57
59
  @io.seek(opts[:seek]) if opts[:seek]
58
60
  @pos = @io.pos
@@ -98,30 +100,6 @@ class PDF::Reader
98
100
  bytes
99
101
  end
100
102
 
101
- # return raw bytes from the underlying IO stream. All bytes up to the first
102
- # occurrence of needle will be returned. The match (if any) is not returned.
103
- # The IO stream cursor is left on the first byte of the match.
104
- #
105
- # needle - a string to search the IO stream for
106
- #
107
- def read_until(needle)
108
- reset_pos
109
- out = ""
110
- size = needle.size
111
-
112
- while out[size * -1, size] != needle && !@io.eof?
113
- out << @io.read(1)
114
- end
115
-
116
- if out[size * -1, size] == needle
117
- out = out[0, out.size - size]
118
- @io.seek(size * -1, IO::SEEK_CUR)
119
- end
120
-
121
- save_pos
122
- out
123
- end
124
-
125
103
  # return the next token from the source. Returns a string if a token
126
104
  # is found, nil if there are no tokens left.
127
105
  #
@@ -141,19 +119,8 @@ class PDF::Reader
141
119
  data = @io.read(1024)
142
120
 
143
121
  # the PDF 1.7 spec (section #3.4) says that EOL markers can be either \r, \n, or both.
144
- # To ensure we find the xref offset correctly, change all possible options to a
145
- # standard format
146
- data = data.gsub("\r\n","\n").gsub("\n\r","\n").gsub("\r","\n")
147
- lines = data.split(/\n/).reverse
148
-
149
- eof_index = nil
150
-
151
- lines.each_with_index do |line, index|
152
- if line =~ /^%%EOF\r?$/
153
- eof_index = index
154
- break
155
- end
156
- end
122
+ lines = data.split(/[\n\r]+/).reverse
123
+ eof_index = lines.index { |l| l.strip == "%%EOF" }
157
124
 
158
125
  raise MalformedPDFError, "PDF does not contain EOF marker" if eof_index.nil?
159
126
  raise MalformedPDFError, "PDF EOF marker does not follow offset" if eof_index >= lines.size-1
@@ -162,6 +129,12 @@ class PDF::Reader
162
129
 
163
130
  private
164
131
 
132
+ # Returns true if this buffer is parsing a content stream
133
+ #
134
+ def in_content_stream?
135
+ @in_content_stream ? true : false
136
+ end
137
+
165
138
  # Some bastard moved our IO stream cursor. Restore it.
166
139
  #
167
140
  def reset_pos
@@ -181,8 +154,12 @@ class PDF::Reader
181
154
  10.times do
182
155
  if state == :literal_string
183
156
  prepare_literal_token
157
+ elsif state == :hex_string
158
+ prepare_hex_token
184
159
  elsif state == :regular
185
160
  prepare_regular_token
161
+ elsif state == :inline
162
+ prepare_inline_token
186
163
  end
187
164
  end
188
165
 
@@ -195,8 +172,12 @@ class PDF::Reader
195
172
  def state
196
173
  if @tokens[-1] == "("
197
174
  :literal_string
175
+ elsif @tokens[-1] == "<"
176
+ :hex_string
198
177
  elsif @tokens[-1] == "stream"
199
178
  :stream
179
+ elsif in_content_stream? && @tokens[-1] == "ID"
180
+ :inline
200
181
  else
201
182
  :regular
202
183
  end
@@ -226,6 +207,44 @@ class PDF::Reader
226
207
  end
227
208
  end
228
209
 
210
+ def prepare_inline_token
211
+ str = ""
212
+
213
+ while str[-2,2] != "EI"
214
+ chr = @io.read(1)
215
+ break if chr.nil?
216
+ str << chr
217
+ end
218
+
219
+ @tokens << str[0, str.size-2].strip
220
+ @io.seek(-2, IO::SEEK_CUR) unless chr.nil?
221
+ end
222
+
223
+ # if we're currently inside a hex string, read hex nibbles until
224
+ # we find a closing >
225
+ #
226
+ def prepare_hex_token
227
+ str = ""
228
+ finished = false
229
+
230
+ while !finished
231
+ chr = @io.read(1)
232
+ codepoint = chr.to_s.unpack("C*").first
233
+ if chr.nil?
234
+ finished = true # unbalanced params
235
+ elsif (48..57).include?(codepoint) || (65..90).include?(codepoint) || (97..122).include?(codepoint)
236
+ str << chr
237
+ elsif codepoint <= 32
238
+ # ignore it
239
+ else
240
+ @tokens << str if str.size > 0
241
+ @tokens << ">" if chr != ">"
242
+ @tokens << chr
243
+ finished = true
244
+ end
245
+ end
246
+ end
247
+
229
248
  # if we're currently inside a literal string we more or less just read bytes until
230
249
  # we find the closing ) delimiter. Lots of bytes that would otherwise indicate the
231
250
  # start of a new token in regular mode are left untouched when inside a literal
@@ -243,10 +262,12 @@ class PDF::Reader
243
262
  chr = @io.read(1)
244
263
  if chr.nil?
245
264
  count = 0 # unbalanced params
246
- elsif chr == "(" && str[-1,1] != "\x5C"
265
+ elsif chr == "\x5c"
266
+ str << chr << @io.read(1).to_s
267
+ elsif chr == "("
247
268
  str << "("
248
269
  count += 1
249
- elsif chr == ")" && str[-1,1] != "\x5C"
270
+ elsif chr == ")"
250
271
  count -= 1
251
272
  str << ")" unless count == 0
252
273
  else
@@ -24,30 +24,31 @@
24
24
  ################################################################################
25
25
 
26
26
  class PDF::Reader
27
- class CMap
27
+ class CMap # :nodoc:
28
28
 
29
29
  def initialize(data)
30
30
  @map = {}
31
- in_char_mode = false
32
- in_range_mode = false
31
+ process_data(data)
32
+ end
33
+
34
+ def process_data(data)
35
+ mode = nil
33
36
  instructions = ""
34
37
 
35
38
  data.each_line do |l|
36
39
  if l.include?("beginbfchar")
37
- in_char_mode = true
40
+ mode = :char
38
41
  elsif l.include?("endbfchar")
39
42
  process_bfchar_instructions(instructions)
40
43
  instructions = ""
41
- in_char_mode = false
44
+ mode = nil
42
45
  elsif l.include?("beginbfrange")
43
- in_range_mode = true
46
+ mode = :range
44
47
  elsif l.include?("endbfrange")
45
48
  process_bfrange_instructions(instructions)
46
49
  instructions = ""
47
- in_range_mode = false
48
- end
49
-
50
- if !l.include?("begin") && (in_char_mode || in_range_mode)
50
+ mode = nil
51
+ elsif mode == :char || mode == :range
51
52
  instructions << l
52
53
  end
53
54
  end