pdf-reader 0.8.2 → 0.8.3
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +4 -0
- data/Rakefile +15 -17
- data/lib/pdf/reader.rb +1 -2
- data/lib/pdf/reader/buffer.rb +34 -25
- metadata +5 -5
data/CHANGELOG
CHANGED
@@ -1,3 +1,7 @@
|
|
1
|
+
v0.8.3 (14th February 2010)
|
2
|
+
- Fix a bug in tokenising of hex strings inside dictionaries
|
3
|
+
- Thanks to Brad Ediger for detecting the issue and proposing a solution
|
4
|
+
|
1
5
|
v0.8.2 (1st January 2010)
|
2
6
|
- Fix parsing of files that use Form XObjects behind an indirect reference
|
3
7
|
(thanks Cornelius Illi and Patrick Crosby)
|
data/Rakefile
CHANGED
@@ -6,7 +6,7 @@ require 'rake/testtask'
|
|
6
6
|
require "rake/gempackagetask"
|
7
7
|
require 'spec/rake/spectask'
|
8
8
|
|
9
|
-
PKG_VERSION = "0.8.2"
|
9
|
+
PKG_VERSION = "0.8.3"
|
10
10
|
PKG_NAME = "pdf-reader"
|
11
11
|
PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"
|
12
12
|
|
@@ -57,27 +57,25 @@ end
|
|
57
57
|
# RSpec files aren't included, as they depend on the PDF files,
|
58
58
|
# which will make the gem filesize irritatingly large
|
59
59
|
spec = Gem::Specification.new do |spec|
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
["Rakefile"]
|
66
|
-
|
60
|
+
spec.name = PKG_NAME
|
61
|
+
spec.version = PKG_VERSION
|
62
|
+
spec.platform = Gem::Platform::RUBY
|
63
|
+
spec.summary = "A library for accessing the content of PDF files"
|
64
|
+
spec.files = Dir.glob("{examples,lib}/**/**/*") + ["Rakefile"]
|
67
65
|
spec.require_path = "lib"
|
68
66
|
spec.bindir = "bin"
|
69
67
|
spec.executables << "pdf_object"
|
70
68
|
spec.executables << "pdf_text"
|
71
69
|
spec.executables << "pdf_list_callbacks"
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
spec.author = "
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
70
|
+
spec.has_rdoc = true
|
71
|
+
spec.extra_rdoc_files = %w{README.rdoc TODO CHANGELOG MIT-LICENSE }
|
72
|
+
spec.rdoc_options << '--title' << 'PDF::Reader Documentation' <<
|
73
|
+
'--main' << 'README.rdoc' << '-q'
|
74
|
+
spec.author = "James Healy"
|
75
|
+
spec.email = "jimmy@deefa.com"
|
76
|
+
spec.rubyforge_project = "pdf-reader"
|
77
|
+
spec.homepage = "http://github.com/yob/pdf-reader"
|
78
|
+
spec.description = "The PDF::Reader library implements a PDF parser conforming as much as possible to the PDF specification from Adobe"
|
81
79
|
spec.add_dependency('Ascii85', '>=0.9')
|
82
80
|
end
|
83
81
|
|
data/lib/pdf/reader.rb
CHANGED
@@ -137,8 +137,7 @@ class PDF::Reader
|
|
137
137
|
################################################################################
|
138
138
|
# Given an IO object that contains PDF data, return the contents of a single object
|
139
139
|
def object (io, id, gen)
|
140
|
-
@buffer = Buffer.new(io)
|
141
|
-
@xref = XRef.new(@buffer)
|
140
|
+
@xref = XRef.new(io)
|
142
141
|
@xref.load
|
143
142
|
|
144
143
|
@xref.object(Reference.new(id, gen))
|
data/lib/pdf/reader/buffer.rb
CHANGED
@@ -27,7 +27,7 @@
|
|
27
27
|
|
28
28
|
class PDF::Reader
|
29
29
|
|
30
|
-
# A string tokeniser that recognises PDF
|
30
|
+
# A string tokeniser that recognises PDF grammar. When passed an IO stream or a
|
31
31
|
# string, repeated calls to token() will return the next token from the source.
|
32
32
|
#
|
33
33
|
# This is very low level, and getting the raw tokens is not very useful in itself.
|
@@ -99,7 +99,7 @@ class PDF::Reader
|
|
99
99
|
end
|
100
100
|
|
101
101
|
# return raw bytes from the underlying IO stream. All bytes up to the first
|
102
|
-
#
|
102
|
+
# occurrence of needle will be returned. The match (if any) is not returned.
|
103
103
|
# The IO stream cursor is left on the first byte of the match.
|
104
104
|
#
|
105
105
|
# needle - a string to search the IO stream for
|
@@ -129,7 +129,7 @@ class PDF::Reader
|
|
129
129
|
reset_pos
|
130
130
|
prepare_tokens if @tokens.size < 3
|
131
131
|
merge_indirect_reference
|
132
|
-
|
132
|
+
prepare_tokens if @tokens.size < 3
|
133
133
|
|
134
134
|
@tokens.shift
|
135
135
|
end
|
@@ -206,10 +206,14 @@ class PDF::Reader
|
|
206
206
|
# them, replace the tokens with a PDF::Reader::Reference instance.
|
207
207
|
#
|
208
208
|
# Merging them into a single string was another option, but that would mean
|
209
|
-
# code further up the
|
209
|
+
# code further up the stack would need to check every token to see if it looks
|
210
210
|
# like an indirect object. For optimisation reasons, I'd rather avoid
|
211
211
|
# that extra check.
|
212
212
|
#
|
213
|
+
# It's incredibly likely that the next 3 tokens in the buffer are NOT an
|
214
|
+
# indirect reference, so test for that case first and avoid the relatively
|
215
|
+
# expensive regexp checks if possible.
|
216
|
+
#
|
213
217
|
def merge_indirect_reference
|
214
218
|
return if @tokens.size < 3
|
215
219
|
return if @tokens[2] != "R"
|
@@ -222,26 +226,8 @@ class PDF::Reader
|
|
222
226
|
end
|
223
227
|
end
|
224
228
|
|
225
|
-
# merge any consequtive tokens that are actually 1 token. The only current
|
226
|
-
# time this is the case is << and >>. < and > are valid tokens (they indicate
|
227
|
-
# a hex string) but so are << and >> (they indicate a dictionary).
|
228
|
-
#
|
229
|
-
def merge_tokens
|
230
|
-
@tokens.each_with_index do |tok, idx|
|
231
|
-
if tok == "<" && @tokens[idx+1] == "<"
|
232
|
-
@tokens.inspect
|
233
|
-
@tokens[idx] = "<<"
|
234
|
-
@tokens[idx+1] = nil
|
235
|
-
elsif tok == ">" && @tokens[idx+1] == ">"
|
236
|
-
@tokens[idx] = ">>"
|
237
|
-
@tokens[idx+1] = nil
|
238
|
-
end
|
239
|
-
end
|
240
|
-
@tokens.compact!
|
241
|
-
end
|
242
|
-
|
243
229
|
# if we're currently inside a literal string we more or less just read bytes until
|
244
|
-
# we find the
|
230
|
+
# we find the closing ) delimiter. Lots of bytes that would otherwise indicate the
|
245
231
|
# start of a new token in regular mode are left untouched when inside a literal
|
246
232
|
# string.
|
247
233
|
#
|
@@ -294,13 +280,27 @@ class PDF::Reader
|
|
294
280
|
@tokens << tok if tok.size > 0
|
295
281
|
tok = ""
|
296
282
|
break
|
297
|
-
when "\x28", "\x3C", "\x5B", "\x7B", "\x2F"
|
283
|
+
when "\x3C"
|
284
|
+
# opening delimiter '<', start of new token
|
285
|
+
@tokens << tok if tok.size > 0
|
286
|
+
chr << @io.read(1) if peek_char == "\x3C" # check if token is actually '<<'
|
287
|
+
@tokens << chr
|
288
|
+
tok = ""
|
289
|
+
break
|
290
|
+
when "\x3E"
|
291
|
+
# closing delimiter '>', start of new token
|
292
|
+
@tokens << tok if tok.size > 0
|
293
|
+
chr << @io.read(1) if peek_char == "\x3E" # check if token is actually '>>'
|
294
|
+
@tokens << chr
|
295
|
+
tok = ""
|
296
|
+
break
|
297
|
+
when "\x28", "\x5B", "\x7B", "\x2F"
|
298
298
|
# opening delimiter, start of new token
|
299
299
|
@tokens << tok if tok.size > 0
|
300
300
|
@tokens << chr
|
301
301
|
tok = ""
|
302
302
|
break
|
303
|
-
when "\x29", "\x3E", "\x5D", "\x7D"
|
303
|
+
when "\x29", "\x5D", "\x7D"
|
304
304
|
# closing delimiter
|
305
305
|
@tokens << tok if tok.size > 0
|
306
306
|
@tokens << chr
|
@@ -313,5 +313,14 @@ class PDF::Reader
|
|
313
313
|
|
314
314
|
@tokens << tok if tok.size > 0
|
315
315
|
end
|
316
|
+
|
317
|
+
# peek at the next character in the io stream, leaving the stream position
|
318
|
+
# untouched
|
319
|
+
#
|
320
|
+
def peek_char
|
321
|
+
chr = @io.read(1)
|
322
|
+
@io.seek(-1, IO::SEEK_CUR) unless chr.nil?
|
323
|
+
chr
|
324
|
+
end
|
316
325
|
end
|
317
326
|
end
|
metadata
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdf-reader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.2
|
4
|
+
version: 0.8.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
|
-
-
|
7
|
+
- James Healy
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2010-
|
12
|
+
date: 2010-02-14 00:00:00 +11:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -23,7 +23,7 @@ dependencies:
|
|
23
23
|
version: "0.9"
|
24
24
|
version:
|
25
25
|
description: The PDF::Reader library implements a PDF parser conforming as much as possible to the PDF specification from Adobe
|
26
|
-
email:
|
26
|
+
email: jimmy@deefa.com
|
27
27
|
executables:
|
28
28
|
- pdf_object
|
29
29
|
- pdf_text
|
@@ -76,7 +76,7 @@ files:
|
|
76
76
|
- CHANGELOG
|
77
77
|
- MIT-LICENSE
|
78
78
|
has_rdoc: true
|
79
|
-
homepage: http://
|
79
|
+
homepage: http://github.com/yob/pdf-reader
|
80
80
|
licenses: []
|
81
81
|
|
82
82
|
post_install_message:
|