pdf-reader 0.8.6 → 0.9.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +17 -0
- data/README.rdoc +7 -15
- data/Rakefile +10 -63
- data/TODO +6 -8
- data/bin/pdf_object +3 -0
- data/bin/pdf_text +4 -2
- data/examples/extract_images.rb +108 -0
- data/examples/hash.rb +1 -1
- data/examples/text.rb +3 -0
- data/lib/pdf/hash.rb +8 -225
- data/lib/pdf/reader.rb +79 -55
- data/lib/pdf/reader/abstract_strategy.rb +77 -0
- data/lib/pdf/reader/buffer.rb +61 -40
- data/lib/pdf/reader/cmap.rb +11 -10
- data/lib/pdf/reader/encoding.rb +85 -79
- data/lib/pdf/reader/error.rb +1 -2
- data/lib/pdf/reader/filter.rb +109 -6
- data/lib/pdf/reader/font.rb +11 -11
- data/lib/pdf/reader/lzw.rb +123 -0
- data/lib/pdf/reader/metadata_strategy.rb +53 -0
- data/lib/pdf/reader/object_hash.rb +275 -0
- data/lib/pdf/reader/object_stream.rb +51 -0
- data/lib/pdf/reader/{content.rb → pages_strategy.rb} +63 -100
- data/lib/pdf/reader/parser.rb +74 -37
- data/lib/pdf/reader/print_receiver.rb +0 -1
- data/lib/pdf/reader/register_receiver.rb +21 -0
- data/lib/pdf/reader/stream.rb +5 -1
- data/lib/pdf/reader/text_receiver.rb +3 -1
- data/lib/pdf/reader/token.rb +1 -1
- data/lib/pdf/reader/xref.rb +126 -64
- metadata +61 -13
- data/lib/pdf/reader/explore.rb +0 -116
data/lib/pdf/reader.rb
CHANGED
@@ -9,10 +9,10 @@
|
|
9
9
|
# distribute, sublicense, and/or sell copies of the Software, and to
|
10
10
|
# permit persons to whom the Software is furnished to do so, subject to
|
11
11
|
# the following conditions:
|
12
|
-
#
|
12
|
+
#
|
13
13
|
# The above copyright notice and this permission notice shall be
|
14
14
|
# included in all copies or substantial portions of the Software.
|
15
|
-
#
|
15
|
+
#
|
16
16
|
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
17
|
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
18
|
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
@@ -24,6 +24,8 @@
|
|
24
24
|
################################################################################
|
25
25
|
|
26
26
|
require 'stringio'
|
27
|
+
require 'zlib'
|
28
|
+
|
27
29
|
require 'ascii85'
|
28
30
|
|
29
31
|
module PDF
|
@@ -37,77 +39,127 @@ module PDF
|
|
37
39
|
# on receivers.
|
38
40
|
#
|
39
41
|
# = Parsing a file
|
40
|
-
#
|
42
|
+
#
|
41
43
|
# PDF::Reader.file("somefile.pdf", receiver)
|
42
44
|
#
|
43
45
|
# = Parsing a String
|
44
|
-
#
|
46
|
+
#
|
45
47
|
# This is useful for processing a PDF that is already in memory
|
46
48
|
#
|
47
49
|
# PDF::Reader.string(pdf_string, receiver)
|
48
50
|
#
|
49
51
|
# = Parsing an IO object
|
50
|
-
#
|
52
|
+
#
|
51
53
|
# This can be a useful alternative to the first 2 options in some situations
|
52
54
|
#
|
53
55
|
# pdf = PDF::Reader.new
|
54
56
|
# pdf.parse(File.new("somefile.pdf"), receiver)
|
55
57
|
#
|
56
58
|
# = Parsing parts of a file
|
57
|
-
#
|
58
|
-
# Both PDF::Reader#file and PDF::Reader#string accept a 3 argument that specifies which
|
59
|
-
# parts of the file to process. By default, all options are enabled, so this can be useful
|
60
|
-
# to cut down processing time if you're only interested in say, metadata.
|
61
59
|
#
|
62
|
-
#
|
63
|
-
#
|
60
|
+
# Both PDF::Reader.file and PDF::Reader.string accept a third argument that
|
61
|
+
# specifies which parts of the file to process. By default, all options are
|
62
|
+
# enabled, so this can be useful to cut down processing time if you're only
|
63
|
+
# interested in, say, metadata.
|
64
|
+
#
|
65
|
+
# As an example, the following call will disable parsing the contents of
|
66
|
+
# pages in the file, but explicitly enables processing metadata.
|
64
67
|
#
|
65
68
|
# PDF::Reader.file("somefile.pdf", receiver, {:metadata => true, :pages => false})
|
66
69
|
#
|
67
70
|
# Available options are currently:
|
68
|
-
#
|
71
|
+
#
|
69
72
|
# :metadata
|
70
73
|
# :pages
|
74
|
+
# :raw_text
|
75
|
+
#
|
71
76
|
class Reader
|
72
|
-
|
77
|
+
|
73
78
|
# Parse the file with the given name, sending events to the given receiver.
|
79
|
+
#
|
74
80
|
def self.file(name, receiver, opts = {})
|
75
81
|
File.open(name,"rb") do |f|
|
76
82
|
new.parse(f, receiver, opts)
|
77
83
|
end
|
78
84
|
end
|
79
|
-
|
85
|
+
|
80
86
|
# Parse the given string, sending events to the given receiver.
|
87
|
+
#
|
81
88
|
def self.string(str, receiver, opts = {})
|
82
89
|
StringIO.open(str) do |s|
|
83
90
|
new.parse(s, receiver, opts)
|
84
91
|
end
|
85
92
|
end
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
93
|
+
|
94
|
+
# Parse the file with the given name, returning an unmarshalled ruby object that
|
95
|
+
# represents the requested pdf object
|
96
|
+
#
|
97
|
+
def self.object_file(name, id, gen = 0)
|
98
|
+
File.open(name,"rb") { |f|
|
99
|
+
new.object(f, id.to_i, gen.to_i)
|
100
|
+
}
|
91
101
|
end
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
102
|
+
|
103
|
+
# Parse the given string, returning an unmarshalled ruby object that represents
|
104
|
+
# the requested pdf object
|
105
|
+
#
|
106
|
+
def self.object_string(str, id, gen = 0)
|
107
|
+
StringIO.open(str) { |s|
|
108
|
+
new.object(s, id.to_i, gen.to_i)
|
109
|
+
}
|
110
|
+
end
|
111
|
+
|
112
|
+
# Given an IO object that contains PDF data, parse it.
|
113
|
+
#
|
114
|
+
def parse(io, receiver, opts = {})
|
115
|
+
ohash = ObjectHash.new(io)
|
116
|
+
|
117
|
+
if ohash.trailer[:Encrypt]
|
118
|
+
raise PDF::Reader::UnsupportedFeatureError, 'PDF::Reader cannot read encrypted PDF files'
|
119
|
+
end
|
120
|
+
|
121
|
+
options = {:pages => true, :raw_text => false, :metadata => true}
|
122
|
+
options.merge!(opts)
|
123
|
+
|
124
|
+
strategies.each do |s|
|
125
|
+
s.new(ohash, receiver, options).process
|
96
126
|
end
|
127
|
+
|
128
|
+
self
|
129
|
+
end
|
130
|
+
|
131
|
+
# Given an IO object that contains PDF data, return the contents of a single object
|
132
|
+
#
|
133
|
+
def object (io, id, gen)
|
134
|
+
@ohash = ObjectHash.new(io)
|
135
|
+
|
136
|
+
@ohash.object(Reference.new(id, gen))
|
137
|
+
end
|
138
|
+
|
139
|
+
private
|
140
|
+
|
141
|
+
def strategies
|
142
|
+
@strategies ||= [
|
143
|
+
PDF::Reader::MetadataStrategy,
|
144
|
+
PDF::Reader::PagesStrategy
|
145
|
+
]
|
97
146
|
end
|
98
|
-
################################################################################
|
99
147
|
end
|
100
|
-
################################################################################
|
101
148
|
end
|
102
149
|
################################################################################
|
103
|
-
|
150
|
+
|
151
|
+
require 'pdf/reader/abstract_strategy'
|
104
152
|
require 'pdf/reader/buffer'
|
105
153
|
require 'pdf/reader/cmap'
|
106
|
-
require 'pdf/reader/content'
|
107
154
|
require 'pdf/reader/encoding'
|
108
155
|
require 'pdf/reader/error'
|
109
156
|
require 'pdf/reader/filter'
|
110
157
|
require 'pdf/reader/font'
|
158
|
+
require 'pdf/reader/lzw'
|
159
|
+
require 'pdf/reader/metadata_strategy'
|
160
|
+
require 'pdf/reader/object_hash'
|
161
|
+
require 'pdf/reader/object_stream'
|
162
|
+
require 'pdf/reader/pages_strategy'
|
111
163
|
require 'pdf/reader/parser'
|
112
164
|
require 'pdf/reader/print_receiver'
|
113
165
|
require 'pdf/reader/reference'
|
@@ -117,31 +169,3 @@ require 'pdf/reader/text_receiver'
|
|
117
169
|
require 'pdf/reader/token'
|
118
170
|
require 'pdf/reader/xref'
|
119
171
|
require 'pdf/hash'
|
120
|
-
|
121
|
-
class PDF::Reader
|
122
|
-
################################################################################
|
123
|
-
# Given an IO object that contains PDF data, parse it.
|
124
|
-
def parse (io, receiver, opts = {})
|
125
|
-
@xref = XRef.new(io)
|
126
|
-
@content = (receiver == Explore ? Explore : Content).new(receiver, @xref)
|
127
|
-
|
128
|
-
options = {:pages => true, :metadata => true}
|
129
|
-
options.merge!(opts)
|
130
|
-
|
131
|
-
trailer = @xref.load
|
132
|
-
raise PDF::Reader::UnsupportedFeatureError, 'PDF::Reader cannot read encrypted PDF files' if trailer[:Encrypt]
|
133
|
-
@content.metadata(@xref.object(trailer[:Root]), @xref.object(trailer[:Info])) if options[:metadata]
|
134
|
-
@content.document(@xref.object(trailer[:Root])) if options[:pages]
|
135
|
-
self
|
136
|
-
end
|
137
|
-
################################################################################
|
138
|
-
# Given an IO object that contains PDF data, return the contents of a single object
|
139
|
-
def object (io, id, gen)
|
140
|
-
@xref = XRef.new(io)
|
141
|
-
@xref.load
|
142
|
-
|
143
|
-
@xref.object(Reference.new(id, gen))
|
144
|
-
end
|
145
|
-
################################################################################
|
146
|
-
end
|
147
|
-
################################################################################
|
@@ -0,0 +1,77 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
class PDF::Reader
|
4
|
+
|
5
|
+
class AbstractStrategy # :nodoc:
|
6
|
+
|
7
|
+
def initialize(ohash, receiver, options = {})
|
8
|
+
@ohash, @receiver, @options = ohash, receiver, options
|
9
|
+
end
|
10
|
+
|
11
|
+
private
|
12
|
+
|
13
|
+
def options
|
14
|
+
@options || {}
|
15
|
+
end
|
16
|
+
|
17
|
+
# calls the name callback method on the receiver class with params as the arguments
|
18
|
+
#
|
19
|
+
def callback (name, params=[])
|
20
|
+
receiver.send(name, *params) if receiver.respond_to?(name)
|
21
|
+
end
|
22
|
+
|
23
|
+
# strings outside of page content should be in either PDFDocEncoding or UTF-16.
|
24
|
+
def decode_strings(obj)
|
25
|
+
case obj
|
26
|
+
when String then
|
27
|
+
if obj[0,2].unpack("C*").slice(0,2) == [254,255]
|
28
|
+
PDF::Reader::Encoding.new(:UTF16Encoding).to_utf8(obj[2, obj.size])
|
29
|
+
else
|
30
|
+
PDF::Reader::Encoding.new(:PDFDocEncoding).to_utf8(obj)
|
31
|
+
end
|
32
|
+
when Hash then obj.each { |key,val| obj[key] = decode_strings(val) }
|
33
|
+
when Array then obj.collect { |item| decode_strings(item) }
|
34
|
+
else
|
35
|
+
obj
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
def info
|
40
|
+
ohash.object(trailer[:Info])
|
41
|
+
end
|
42
|
+
|
43
|
+
def info?
|
44
|
+
info ? true : false
|
45
|
+
end
|
46
|
+
|
47
|
+
def ohash
|
48
|
+
@ohash
|
49
|
+
end
|
50
|
+
|
51
|
+
def pages
|
52
|
+
ohash.object(root[:Pages])
|
53
|
+
end
|
54
|
+
|
55
|
+
def pages?
|
56
|
+
pages ? true : false
|
57
|
+
end
|
58
|
+
|
59
|
+
def receiver
|
60
|
+
@receiver
|
61
|
+
end
|
62
|
+
|
63
|
+
def root
|
64
|
+
ohash.object(trailer[:Root])
|
65
|
+
end
|
66
|
+
|
67
|
+
def root?
|
68
|
+
root ? true : false
|
69
|
+
end
|
70
|
+
|
71
|
+
def trailer
|
72
|
+
ohash.trailer
|
73
|
+
end
|
74
|
+
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
data/lib/pdf/reader/buffer.rb
CHANGED
@@ -48,11 +48,13 @@ class PDF::Reader
|
|
48
48
|
# options:
|
49
49
|
#
|
50
50
|
# :seek - a byte offset to seek to before starting to tokenise
|
51
|
+
# :content_stream - set to true if buffer will be tokenising a
|
52
|
+
# content stream. Defaults to false
|
51
53
|
#
|
52
54
|
def initialize (io, opts = {})
|
53
55
|
@io = io
|
54
56
|
@tokens = []
|
55
|
-
@
|
57
|
+
@in_content_stream = opts[:content_stream]
|
56
58
|
|
57
59
|
@io.seek(opts[:seek]) if opts[:seek]
|
58
60
|
@pos = @io.pos
|
@@ -98,30 +100,6 @@ class PDF::Reader
|
|
98
100
|
bytes
|
99
101
|
end
|
100
102
|
|
101
|
-
# return raw bytes from the underlying IO stream. All bytes up to the first
|
102
|
-
# occurrence of needle will be returned. The match (if any) is not returned.
|
103
|
-
# The IO stream cursor is left on the first byte of the match.
|
104
|
-
#
|
105
|
-
# needle - a string to search the IO stream for
|
106
|
-
#
|
107
|
-
def read_until(needle)
|
108
|
-
reset_pos
|
109
|
-
out = ""
|
110
|
-
size = needle.size
|
111
|
-
|
112
|
-
while out[size * -1, size] != needle && !@io.eof?
|
113
|
-
out << @io.read(1)
|
114
|
-
end
|
115
|
-
|
116
|
-
if out[size * -1, size] == needle
|
117
|
-
out = out[0, out.size - size]
|
118
|
-
@io.seek(size * -1, IO::SEEK_CUR)
|
119
|
-
end
|
120
|
-
|
121
|
-
save_pos
|
122
|
-
out
|
123
|
-
end
|
124
|
-
|
125
103
|
# return the next token from the source. Returns a string if a token
|
126
104
|
# is found, nil if there are no tokens left.
|
127
105
|
#
|
@@ -141,19 +119,8 @@ class PDF::Reader
|
|
141
119
|
data = @io.read(1024)
|
142
120
|
|
143
121
|
# the PDF 1.7 spec (section #3.4) says that EOL markers can be either \r, \n, or both.
|
144
|
-
|
145
|
-
|
146
|
-
data = data.gsub("\r\n","\n").gsub("\n\r","\n").gsub("\r","\n")
|
147
|
-
lines = data.split(/\n/).reverse
|
148
|
-
|
149
|
-
eof_index = nil
|
150
|
-
|
151
|
-
lines.each_with_index do |line, index|
|
152
|
-
if line =~ /^%%EOF\r?$/
|
153
|
-
eof_index = index
|
154
|
-
break
|
155
|
-
end
|
156
|
-
end
|
122
|
+
lines = data.split(/[\n\r]+/).reverse
|
123
|
+
eof_index = lines.index { |l| l.strip == "%%EOF" }
|
157
124
|
|
158
125
|
raise MalformedPDFError, "PDF does not contain EOF marker" if eof_index.nil?
|
159
126
|
raise MalformedPDFError, "PDF EOF marker does not follow offset" if eof_index >= lines.size-1
|
@@ -162,6 +129,12 @@ class PDF::Reader
|
|
162
129
|
|
163
130
|
private
|
164
131
|
|
132
|
+
# Returns true if this buffer is parsing a content stream
|
133
|
+
#
|
134
|
+
def in_content_stream?
|
135
|
+
@in_content_stream ? true : false
|
136
|
+
end
|
137
|
+
|
165
138
|
# Some bastard moved our IO stream cursor. Restore it.
|
166
139
|
#
|
167
140
|
def reset_pos
|
@@ -181,8 +154,12 @@ class PDF::Reader
|
|
181
154
|
10.times do
|
182
155
|
if state == :literal_string
|
183
156
|
prepare_literal_token
|
157
|
+
elsif state == :hex_string
|
158
|
+
prepare_hex_token
|
184
159
|
elsif state == :regular
|
185
160
|
prepare_regular_token
|
161
|
+
elsif state == :inline
|
162
|
+
prepare_inline_token
|
186
163
|
end
|
187
164
|
end
|
188
165
|
|
@@ -195,8 +172,12 @@ class PDF::Reader
|
|
195
172
|
def state
|
196
173
|
if @tokens[-1] == "("
|
197
174
|
:literal_string
|
175
|
+
elsif @tokens[-1] == "<"
|
176
|
+
:hex_string
|
198
177
|
elsif @tokens[-1] == "stream"
|
199
178
|
:stream
|
179
|
+
elsif in_content_stream? && @tokens[-1] == "ID"
|
180
|
+
:inline
|
200
181
|
else
|
201
182
|
:regular
|
202
183
|
end
|
@@ -226,6 +207,44 @@ class PDF::Reader
|
|
226
207
|
end
|
227
208
|
end
|
228
209
|
|
210
|
+
def prepare_inline_token
|
211
|
+
str = ""
|
212
|
+
|
213
|
+
while str[-2,2] != "EI"
|
214
|
+
chr = @io.read(1)
|
215
|
+
break if chr.nil?
|
216
|
+
str << chr
|
217
|
+
end
|
218
|
+
|
219
|
+
@tokens << str[0, str.size-2].strip
|
220
|
+
@io.seek(-2, IO::SEEK_CUR) unless chr.nil?
|
221
|
+
end
|
222
|
+
|
223
|
+
# if we're currently inside a hex string, read hex nibbles until
|
224
|
+
# we find a closing >
|
225
|
+
#
|
226
|
+
def prepare_hex_token
|
227
|
+
str = ""
|
228
|
+
finished = false
|
229
|
+
|
230
|
+
while !finished
|
231
|
+
chr = @io.read(1)
|
232
|
+
codepoint = chr.to_s.unpack("C*").first
|
233
|
+
if chr.nil?
|
234
|
+
finished = true # unbalanced params
|
235
|
+
elsif (48..57).include?(codepoint) || (65..90).include?(codepoint) || (97..122).include?(codepoint)
|
236
|
+
str << chr
|
237
|
+
elsif codepoint <= 32
|
238
|
+
# ignore it
|
239
|
+
else
|
240
|
+
@tokens << str if str.size > 0
|
241
|
+
@tokens << ">" if chr != ">"
|
242
|
+
@tokens << chr
|
243
|
+
finished = true
|
244
|
+
end
|
245
|
+
end
|
246
|
+
end
|
247
|
+
|
229
248
|
# if we're currently inside a literal string we more or less just read bytes until
|
230
249
|
# we find the closing ) delimiter. Lots of bytes that would otherwise indicate the
|
231
250
|
# start of a new token in regular mode are left untouched when inside a literal
|
@@ -243,10 +262,12 @@ class PDF::Reader
|
|
243
262
|
chr = @io.read(1)
|
244
263
|
if chr.nil?
|
245
264
|
count = 0 # unbalanced params
|
246
|
-
elsif chr == "
|
265
|
+
elsif chr == "\x5c"
|
266
|
+
str << chr << @io.read(1).to_s
|
267
|
+
elsif chr == "("
|
247
268
|
str << "("
|
248
269
|
count += 1
|
249
|
-
elsif chr == ")"
|
270
|
+
elsif chr == ")"
|
250
271
|
count -= 1
|
251
272
|
str << ")" unless count == 0
|
252
273
|
else
|
data/lib/pdf/reader/cmap.rb
CHANGED
@@ -24,30 +24,31 @@
|
|
24
24
|
################################################################################
|
25
25
|
|
26
26
|
class PDF::Reader
|
27
|
-
class CMap
|
27
|
+
class CMap # :nodoc:
|
28
28
|
|
29
29
|
def initialize(data)
|
30
30
|
@map = {}
|
31
|
-
|
32
|
-
|
31
|
+
process_data(data)
|
32
|
+
end
|
33
|
+
|
34
|
+
def process_data(data)
|
35
|
+
mode = nil
|
33
36
|
instructions = ""
|
34
37
|
|
35
38
|
data.each_line do |l|
|
36
39
|
if l.include?("beginbfchar")
|
37
|
-
|
40
|
+
mode = :char
|
38
41
|
elsif l.include?("endbfchar")
|
39
42
|
process_bfchar_instructions(instructions)
|
40
43
|
instructions = ""
|
41
|
-
|
44
|
+
mode = nil
|
42
45
|
elsif l.include?("beginbfrange")
|
43
|
-
|
46
|
+
mode = :range
|
44
47
|
elsif l.include?("endbfrange")
|
45
48
|
process_bfrange_instructions(instructions)
|
46
49
|
instructions = ""
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
if !l.include?("begin") && (in_char_mode || in_range_mode)
|
50
|
+
mode = nil
|
51
|
+
elsif mode == :char || mode == :range
|
51
52
|
instructions << l
|
52
53
|
end
|
53
54
|
end
|