pdf-reader 0.8.6 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +17 -0
- data/README.rdoc +7 -15
- data/Rakefile +10 -63
- data/TODO +6 -8
- data/bin/pdf_object +3 -0
- data/bin/pdf_text +4 -2
- data/examples/extract_images.rb +108 -0
- data/examples/hash.rb +1 -1
- data/examples/text.rb +3 -0
- data/lib/pdf/hash.rb +8 -225
- data/lib/pdf/reader.rb +79 -55
- data/lib/pdf/reader/abstract_strategy.rb +77 -0
- data/lib/pdf/reader/buffer.rb +61 -40
- data/lib/pdf/reader/cmap.rb +11 -10
- data/lib/pdf/reader/encoding.rb +85 -79
- data/lib/pdf/reader/error.rb +1 -2
- data/lib/pdf/reader/filter.rb +109 -6
- data/lib/pdf/reader/font.rb +11 -11
- data/lib/pdf/reader/lzw.rb +123 -0
- data/lib/pdf/reader/metadata_strategy.rb +53 -0
- data/lib/pdf/reader/object_hash.rb +275 -0
- data/lib/pdf/reader/object_stream.rb +51 -0
- data/lib/pdf/reader/{content.rb → pages_strategy.rb} +63 -100
- data/lib/pdf/reader/parser.rb +74 -37
- data/lib/pdf/reader/print_receiver.rb +0 -1
- data/lib/pdf/reader/register_receiver.rb +21 -0
- data/lib/pdf/reader/stream.rb +5 -1
- data/lib/pdf/reader/text_receiver.rb +3 -1
- data/lib/pdf/reader/token.rb +1 -1
- data/lib/pdf/reader/xref.rb +126 -64
- metadata +61 -13
- data/lib/pdf/reader/explore.rb +0 -116
data/lib/pdf/reader.rb
CHANGED
@@ -9,10 +9,10 @@
|
|
9
9
|
# distribute, sublicense, and/or sell copies of the Software, and to
|
10
10
|
# permit persons to whom the Software is furnished to do so, subject to
|
11
11
|
# the following conditions:
|
12
|
-
#
|
12
|
+
#
|
13
13
|
# The above copyright notice and this permission notice shall be
|
14
14
|
# included in all copies or substantial portions of the Software.
|
15
|
-
#
|
15
|
+
#
|
16
16
|
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
17
|
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
18
|
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
@@ -24,6 +24,8 @@
|
|
24
24
|
################################################################################
|
25
25
|
|
26
26
|
require 'stringio'
|
27
|
+
require 'zlib'
|
28
|
+
|
27
29
|
require 'ascii85'
|
28
30
|
|
29
31
|
module PDF
|
@@ -37,77 +39,127 @@ module PDF
|
|
37
39
|
# on receivers.
|
38
40
|
#
|
39
41
|
# = Parsing a file
|
40
|
-
#
|
42
|
+
#
|
41
43
|
# PDF::Reader.file("somefile.pdf", receiver)
|
42
44
|
#
|
43
45
|
# = Parsing a String
|
44
|
-
#
|
46
|
+
#
|
45
47
|
# This is useful for processing a PDF that is already in memory
|
46
48
|
#
|
47
49
|
# PDF::Reader.string(pdf_string, receiver)
|
48
50
|
#
|
49
51
|
# = Parsing an IO object
|
50
|
-
#
|
52
|
+
#
|
51
53
|
# This can be a useful alternative to the first 2 options in some situations
|
52
54
|
#
|
53
55
|
# pdf = PDF::Reader.new
|
54
56
|
# pdf.parse(File.new("somefile.pdf"), receiver)
|
55
57
|
#
|
56
58
|
# = Parsing parts of a file
|
57
|
-
#
|
58
|
-
# Both PDF::Reader#file and PDF::Reader#string accept a 3 argument that specifies which
|
59
|
-
# parts of the file to process. By default, all options are enabled, so this can be useful
|
60
|
-
# to cut down processing time if you're only interested in say, metadata.
|
61
59
|
#
|
62
|
-
#
|
63
|
-
#
|
60
|
+
# Both PDF::Reader.file and PDF::Reader.string accept a third argument that
|
61
|
+
# specifies which parts of the file to process. By default, all options are
|
62
|
+
# enabled, so this can be useful to cut down processing time if you're only
|
63
|
+
# interested in say, metadata.
|
64
|
+
#
|
65
|
+
# As an example, the following call will disable parsing the contents of
|
66
|
+
# pages in the file, but explicitly enables processing metadata.
|
64
67
|
#
|
65
68
|
# PDF::Reader.file("somefile.pdf", receiver, {:metadata => true, :pages => false})
|
66
69
|
#
|
67
70
|
# Available options are currently:
|
68
|
-
#
|
71
|
+
#
|
69
72
|
# :metadata
|
70
73
|
# :pages
|
74
|
+
# :raw_text
|
75
|
+
#
|
71
76
|
class Reader
|
72
|
-
|
77
|
+
|
73
78
|
# Parse the file with the given name, sending events to the given receiver.
|
79
|
+
#
|
74
80
|
def self.file(name, receiver, opts = {})
|
75
81
|
File.open(name,"rb") do |f|
|
76
82
|
new.parse(f, receiver, opts)
|
77
83
|
end
|
78
84
|
end
|
79
|
-
|
85
|
+
|
80
86
|
# Parse the given string, sending events to the given receiver.
|
87
|
+
#
|
81
88
|
def self.string(str, receiver, opts = {})
|
82
89
|
StringIO.open(str) do |s|
|
83
90
|
new.parse(s, receiver, opts)
|
84
91
|
end
|
85
92
|
end
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
93
|
+
|
94
|
+
# Parse the file with the given name, returning an unmarshalled ruby object that
|
95
|
+
# represents the requested pdf object
|
96
|
+
#
|
97
|
+
def self.object_file(name, id, gen = 0)
|
98
|
+
File.open(name,"rb") { |f|
|
99
|
+
new.object(f, id.to_i, gen.to_i)
|
100
|
+
}
|
91
101
|
end
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
102
|
+
|
103
|
+
# Parse the given string, returning an unmarshalled ruby object that represents
|
104
|
+
# the requested pdf object
|
105
|
+
#
|
106
|
+
def self.object_string(str, id, gen = 0)
|
107
|
+
StringIO.open(str) { |s|
|
108
|
+
new.object(s, id.to_i, gen.to_i)
|
109
|
+
}
|
110
|
+
end
|
111
|
+
|
112
|
+
# Given an IO object that contains PDF data, parse it.
|
113
|
+
#
|
114
|
+
def parse(io, receiver, opts = {})
|
115
|
+
ohash = ObjectHash.new(io)
|
116
|
+
|
117
|
+
if ohash.trailer[:Encrypt]
|
118
|
+
raise PDF::Reader::UnsupportedFeatureError, 'PDF::Reader cannot read encrypted PDF files'
|
119
|
+
end
|
120
|
+
|
121
|
+
options = {:pages => true, :raw_text => false, :metadata => true}
|
122
|
+
options.merge!(opts)
|
123
|
+
|
124
|
+
strategies.each do |s|
|
125
|
+
s.new(ohash, receiver, options).process
|
96
126
|
end
|
127
|
+
|
128
|
+
self
|
129
|
+
end
|
130
|
+
|
131
|
+
# Given an IO object that contains PDF data, return the contents of a single object
|
132
|
+
#
|
133
|
+
def object (io, id, gen)
|
134
|
+
@ohash = ObjectHash.new(io)
|
135
|
+
|
136
|
+
@ohash.object(Reference.new(id, gen))
|
137
|
+
end
|
138
|
+
|
139
|
+
private
|
140
|
+
|
141
|
+
def strategies
|
142
|
+
@strategies ||= [
|
143
|
+
PDF::Reader::MetadataStrategy,
|
144
|
+
PDF::Reader::PagesStrategy
|
145
|
+
]
|
97
146
|
end
|
98
|
-
################################################################################
|
99
147
|
end
|
100
|
-
################################################################################
|
101
148
|
end
|
102
149
|
################################################################################
|
103
|
-
|
150
|
+
|
151
|
+
require 'pdf/reader/abstract_strategy'
|
104
152
|
require 'pdf/reader/buffer'
|
105
153
|
require 'pdf/reader/cmap'
|
106
|
-
require 'pdf/reader/content'
|
107
154
|
require 'pdf/reader/encoding'
|
108
155
|
require 'pdf/reader/error'
|
109
156
|
require 'pdf/reader/filter'
|
110
157
|
require 'pdf/reader/font'
|
158
|
+
require 'pdf/reader/lzw'
|
159
|
+
require 'pdf/reader/metadata_strategy'
|
160
|
+
require 'pdf/reader/object_hash'
|
161
|
+
require 'pdf/reader/object_stream'
|
162
|
+
require 'pdf/reader/pages_strategy'
|
111
163
|
require 'pdf/reader/parser'
|
112
164
|
require 'pdf/reader/print_receiver'
|
113
165
|
require 'pdf/reader/reference'
|
@@ -117,31 +169,3 @@ require 'pdf/reader/text_receiver'
|
|
117
169
|
require 'pdf/reader/token'
|
118
170
|
require 'pdf/reader/xref'
|
119
171
|
require 'pdf/hash'
|
120
|
-
|
121
|
-
class PDF::Reader
|
122
|
-
################################################################################
|
123
|
-
# Given an IO object that contains PDF data, parse it.
|
124
|
-
def parse (io, receiver, opts = {})
|
125
|
-
@xref = XRef.new(io)
|
126
|
-
@content = (receiver == Explore ? Explore : Content).new(receiver, @xref)
|
127
|
-
|
128
|
-
options = {:pages => true, :metadata => true}
|
129
|
-
options.merge!(opts)
|
130
|
-
|
131
|
-
trailer = @xref.load
|
132
|
-
raise PDF::Reader::UnsupportedFeatureError, 'PDF::Reader cannot read encrypted PDF files' if trailer[:Encrypt]
|
133
|
-
@content.metadata(@xref.object(trailer[:Root]), @xref.object(trailer[:Info])) if options[:metadata]
|
134
|
-
@content.document(@xref.object(trailer[:Root])) if options[:pages]
|
135
|
-
self
|
136
|
-
end
|
137
|
-
################################################################################
|
138
|
-
# Given an IO object that contains PDF data, return the contents of a single object
|
139
|
-
def object (io, id, gen)
|
140
|
-
@xref = XRef.new(io)
|
141
|
-
@xref.load
|
142
|
-
|
143
|
-
@xref.object(Reference.new(id, gen))
|
144
|
-
end
|
145
|
-
################################################################################
|
146
|
-
end
|
147
|
-
################################################################################
|
@@ -0,0 +1,77 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
class PDF::Reader
|
4
|
+
|
5
|
+
class AbstractStrategy # :nodoc:
|
6
|
+
|
7
|
+
def initialize(ohash, receiver, options = {})
|
8
|
+
@ohash, @receiver, @options = ohash, receiver, options
|
9
|
+
end
|
10
|
+
|
11
|
+
private
|
12
|
+
|
13
|
+
def options
|
14
|
+
@options || {}
|
15
|
+
end
|
16
|
+
|
17
|
+
# calls the name callback method on the receiver class with params as the arguments
|
18
|
+
#
|
19
|
+
def callback (name, params=[])
|
20
|
+
receiver.send(name, *params) if receiver.respond_to?(name)
|
21
|
+
end
|
22
|
+
|
23
|
+
# strings outside of page content should be in either PDFDocEncoding or UTF-16.
|
24
|
+
def decode_strings(obj)
|
25
|
+
case obj
|
26
|
+
when String then
|
27
|
+
if obj[0,2].unpack("C*").slice(0,2) == [254,255]
|
28
|
+
PDF::Reader::Encoding.new(:UTF16Encoding).to_utf8(obj[2, obj.size])
|
29
|
+
else
|
30
|
+
PDF::Reader::Encoding.new(:PDFDocEncoding).to_utf8(obj)
|
31
|
+
end
|
32
|
+
when Hash then obj.each { |key,val| obj[key] = decode_strings(val) }
|
33
|
+
when Array then obj.collect { |item| decode_strings(item) }
|
34
|
+
else
|
35
|
+
obj
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
def info
|
40
|
+
ohash.object(trailer[:Info])
|
41
|
+
end
|
42
|
+
|
43
|
+
def info?
|
44
|
+
info ? true : false
|
45
|
+
end
|
46
|
+
|
47
|
+
def ohash
|
48
|
+
@ohash
|
49
|
+
end
|
50
|
+
|
51
|
+
def pages
|
52
|
+
ohash.object(root[:Pages])
|
53
|
+
end
|
54
|
+
|
55
|
+
def pages?
|
56
|
+
pages ? true : false
|
57
|
+
end
|
58
|
+
|
59
|
+
def receiver
|
60
|
+
@receiver
|
61
|
+
end
|
62
|
+
|
63
|
+
def root
|
64
|
+
ohash.object(trailer[:Root])
|
65
|
+
end
|
66
|
+
|
67
|
+
def root?
|
68
|
+
root ? true : false
|
69
|
+
end
|
70
|
+
|
71
|
+
def trailer
|
72
|
+
ohash.trailer
|
73
|
+
end
|
74
|
+
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
data/lib/pdf/reader/buffer.rb
CHANGED
@@ -48,11 +48,13 @@ class PDF::Reader
|
|
48
48
|
# options:
|
49
49
|
#
|
50
50
|
# :seek - a byte offset to seek to before starting to tokenise
|
51
|
+
# :content_stream - set to true if buffer will be tokenising a
|
52
|
+
# content stream. Defaults to false
|
51
53
|
#
|
52
54
|
def initialize (io, opts = {})
|
53
55
|
@io = io
|
54
56
|
@tokens = []
|
55
|
-
@
|
57
|
+
@in_content_stream = opts[:content_stream]
|
56
58
|
|
57
59
|
@io.seek(opts[:seek]) if opts[:seek]
|
58
60
|
@pos = @io.pos
|
@@ -98,30 +100,6 @@ class PDF::Reader
|
|
98
100
|
bytes
|
99
101
|
end
|
100
102
|
|
101
|
-
# return raw bytes from the underlying IO stream. All bytes up to the first
|
102
|
-
# occurrence of needle will be returned. The match (if any) is not returned.
|
103
|
-
# The IO stream cursor is left on the first byte of the match.
|
104
|
-
#
|
105
|
-
# needle - a string to search the IO stream for
|
106
|
-
#
|
107
|
-
def read_until(needle)
|
108
|
-
reset_pos
|
109
|
-
out = ""
|
110
|
-
size = needle.size
|
111
|
-
|
112
|
-
while out[size * -1, size] != needle && !@io.eof?
|
113
|
-
out << @io.read(1)
|
114
|
-
end
|
115
|
-
|
116
|
-
if out[size * -1, size] == needle
|
117
|
-
out = out[0, out.size - size]
|
118
|
-
@io.seek(size * -1, IO::SEEK_CUR)
|
119
|
-
end
|
120
|
-
|
121
|
-
save_pos
|
122
|
-
out
|
123
|
-
end
|
124
|
-
|
125
103
|
# return the next token from the source. Returns a string if a token
|
126
104
|
# is found, nil if there are no tokens left.
|
127
105
|
#
|
@@ -141,19 +119,8 @@ class PDF::Reader
|
|
141
119
|
data = @io.read(1024)
|
142
120
|
|
143
121
|
# the PDF 1.7 spec (section #3.4) says that EOL markers can be either \r, \n, or both.
|
144
|
-
|
145
|
-
|
146
|
-
data = data.gsub("\r\n","\n").gsub("\n\r","\n").gsub("\r","\n")
|
147
|
-
lines = data.split(/\n/).reverse
|
148
|
-
|
149
|
-
eof_index = nil
|
150
|
-
|
151
|
-
lines.each_with_index do |line, index|
|
152
|
-
if line =~ /^%%EOF\r?$/
|
153
|
-
eof_index = index
|
154
|
-
break
|
155
|
-
end
|
156
|
-
end
|
122
|
+
lines = data.split(/[\n\r]+/).reverse
|
123
|
+
eof_index = lines.index { |l| l.strip == "%%EOF" }
|
157
124
|
|
158
125
|
raise MalformedPDFError, "PDF does not contain EOF marker" if eof_index.nil?
|
159
126
|
raise MalformedPDFError, "PDF EOF marker does not follow offset" if eof_index >= lines.size-1
|
@@ -162,6 +129,12 @@ class PDF::Reader
|
|
162
129
|
|
163
130
|
private
|
164
131
|
|
132
|
+
# Returns true if this buffer is parsing a content stream
|
133
|
+
#
|
134
|
+
def in_content_stream?
|
135
|
+
@in_content_stream ? true : false
|
136
|
+
end
|
137
|
+
|
165
138
|
# Some bastard moved our IO stream cursor. Restore it.
|
166
139
|
#
|
167
140
|
def reset_pos
|
@@ -181,8 +154,12 @@ class PDF::Reader
|
|
181
154
|
10.times do
|
182
155
|
if state == :literal_string
|
183
156
|
prepare_literal_token
|
157
|
+
elsif state == :hex_string
|
158
|
+
prepare_hex_token
|
184
159
|
elsif state == :regular
|
185
160
|
prepare_regular_token
|
161
|
+
elsif state == :inline
|
162
|
+
prepare_inline_token
|
186
163
|
end
|
187
164
|
end
|
188
165
|
|
@@ -195,8 +172,12 @@ class PDF::Reader
|
|
195
172
|
def state
|
196
173
|
if @tokens[-1] == "("
|
197
174
|
:literal_string
|
175
|
+
elsif @tokens[-1] == "<"
|
176
|
+
:hex_string
|
198
177
|
elsif @tokens[-1] == "stream"
|
199
178
|
:stream
|
179
|
+
elsif in_content_stream? && @tokens[-1] == "ID"
|
180
|
+
:inline
|
200
181
|
else
|
201
182
|
:regular
|
202
183
|
end
|
@@ -226,6 +207,44 @@ class PDF::Reader
|
|
226
207
|
end
|
227
208
|
end
|
228
209
|
|
210
|
+
def prepare_inline_token
|
211
|
+
str = ""
|
212
|
+
|
213
|
+
while str[-2,2] != "EI"
|
214
|
+
chr = @io.read(1)
|
215
|
+
break if chr.nil?
|
216
|
+
str << chr
|
217
|
+
end
|
218
|
+
|
219
|
+
@tokens << str[0, str.size-2].strip
|
220
|
+
@io.seek(-2, IO::SEEK_CUR) unless chr.nil?
|
221
|
+
end
|
222
|
+
|
223
|
+
# if we're currently inside a hex string, read hex nibbles until
|
224
|
+
# we find a closing >
|
225
|
+
#
|
226
|
+
def prepare_hex_token
|
227
|
+
str = ""
|
228
|
+
finished = false
|
229
|
+
|
230
|
+
while !finished
|
231
|
+
chr = @io.read(1)
|
232
|
+
codepoint = chr.to_s.unpack("C*").first
|
233
|
+
if chr.nil?
|
234
|
+
finished = true # unbalanced params
|
235
|
+
elsif (48..57).include?(codepoint) || (65..90).include?(codepoint) || (97..122).include?(codepoint)
|
236
|
+
str << chr
|
237
|
+
elsif codepoint <= 32
|
238
|
+
# ignore it
|
239
|
+
else
|
240
|
+
@tokens << str if str.size > 0
|
241
|
+
@tokens << ">" if chr != ">"
|
242
|
+
@tokens << chr
|
243
|
+
finished = true
|
244
|
+
end
|
245
|
+
end
|
246
|
+
end
|
247
|
+
|
229
248
|
# if we're currently inside a literal string we more or less just read bytes until
|
230
249
|
# we find the closing ) delimiter. Lots of bytes that would otherwise indicate the
|
231
250
|
# start of a new token in regular mode are left untouched when inside a literal
|
@@ -243,10 +262,12 @@ class PDF::Reader
|
|
243
262
|
chr = @io.read(1)
|
244
263
|
if chr.nil?
|
245
264
|
count = 0 # unbalanced params
|
246
|
-
elsif chr == "
|
265
|
+
elsif chr == "\x5c"
|
266
|
+
str << chr << @io.read(1).to_s
|
267
|
+
elsif chr == "("
|
247
268
|
str << "("
|
248
269
|
count += 1
|
249
|
-
elsif chr == ")"
|
270
|
+
elsif chr == ")"
|
250
271
|
count -= 1
|
251
272
|
str << ")" unless count == 0
|
252
273
|
else
|
data/lib/pdf/reader/cmap.rb
CHANGED
@@ -24,30 +24,31 @@
|
|
24
24
|
################################################################################
|
25
25
|
|
26
26
|
class PDF::Reader
|
27
|
-
class CMap
|
27
|
+
class CMap # :nodoc:
|
28
28
|
|
29
29
|
def initialize(data)
|
30
30
|
@map = {}
|
31
|
-
|
32
|
-
|
31
|
+
process_data(data)
|
32
|
+
end
|
33
|
+
|
34
|
+
def process_data(data)
|
35
|
+
mode = nil
|
33
36
|
instructions = ""
|
34
37
|
|
35
38
|
data.each_line do |l|
|
36
39
|
if l.include?("beginbfchar")
|
37
|
-
|
40
|
+
mode = :char
|
38
41
|
elsif l.include?("endbfchar")
|
39
42
|
process_bfchar_instructions(instructions)
|
40
43
|
instructions = ""
|
41
|
-
|
44
|
+
mode = nil
|
42
45
|
elsif l.include?("beginbfrange")
|
43
|
-
|
46
|
+
mode = :range
|
44
47
|
elsif l.include?("endbfrange")
|
45
48
|
process_bfrange_instructions(instructions)
|
46
49
|
instructions = ""
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
if !l.include?("begin") && (in_char_mode || in_range_mode)
|
50
|
+
mode = nil
|
51
|
+
elsif mode == :char || mode == :range
|
51
52
|
instructions << l
|
52
53
|
end
|
53
54
|
end
|