pdf-reader 0.8.2 → 0.8.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +4 -0
- data/Rakefile +15 -17
- data/lib/pdf/reader.rb +1 -2
- data/lib/pdf/reader/buffer.rb +34 -25
- metadata +5 -5
data/CHANGELOG
CHANGED
@@ -1,3 +1,7 @@
|
|
1
|
+
v0.8.3 (14th February 2010)
|
2
|
+
- Fix a bug in tokenising of hex strings inside dictionaries
|
3
|
+
- Thanks to Brad Ediger for detecting the issue and proposing a solution
|
4
|
+
|
1
5
|
v0.8.2 (1st January 2010)
|
2
6
|
- Fix parsing of files that use Form XObjects behind an indirect reference
|
3
7
|
(thanks Cornelius Illi and Patrick Crosby)
|
data/Rakefile
CHANGED
@@ -6,7 +6,7 @@ require 'rake/testtask'
|
|
6
6
|
require "rake/gempackagetask"
|
7
7
|
require 'spec/rake/spectask'
|
8
8
|
|
9
|
-
PKG_VERSION = "0.8.2"
|
9
|
+
PKG_VERSION = "0.8.3"
|
10
10
|
PKG_NAME = "pdf-reader"
|
11
11
|
PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"
|
12
12
|
|
@@ -57,27 +57,25 @@ end
|
|
57
57
|
# RSpec files aren't included, as they depend on the PDF files,
|
58
58
|
# which will make the gem filesize irritatingly large
|
59
59
|
spec = Gem::Specification.new do |spec|
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
["Rakefile"]
|
66
|
-
|
60
|
+
spec.name = PKG_NAME
|
61
|
+
spec.version = PKG_VERSION
|
62
|
+
spec.platform = Gem::Platform::RUBY
|
63
|
+
spec.summary = "A library for accessing the content of PDF files"
|
64
|
+
spec.files = Dir.glob("{examples,lib}/**/**/*") + ["Rakefile"]
|
67
65
|
spec.require_path = "lib"
|
68
66
|
spec.bindir = "bin"
|
69
67
|
spec.executables << "pdf_object"
|
70
68
|
spec.executables << "pdf_text"
|
71
69
|
spec.executables << "pdf_list_callbacks"
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
spec.author = "
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
70
|
+
spec.has_rdoc = true
|
71
|
+
spec.extra_rdoc_files = %w{README.rdoc TODO CHANGELOG MIT-LICENSE }
|
72
|
+
spec.rdoc_options << '--title' << 'PDF::Reader Documentation' <<
|
73
|
+
'--main' << 'README.rdoc' << '-q'
|
74
|
+
spec.author = "James Healy"
|
75
|
+
spec.email = "jimmy@deefa.com"
|
76
|
+
spec.rubyforge_project = "pdf-reader"
|
77
|
+
spec.homepage = "http://github.com/yob/pdf-reader"
|
78
|
+
spec.description = "The PDF::Reader library implements a PDF parser conforming as much as possible to the PDF specification from Adobe"
|
81
79
|
spec.add_dependency('Ascii85', '>=0.9')
|
82
80
|
end
|
83
81
|
|
data/lib/pdf/reader.rb
CHANGED
@@ -137,8 +137,7 @@ class PDF::Reader
|
|
137
137
|
################################################################################
|
138
138
|
# Given an IO object that contains PDF data, return the contents of a single object
|
139
139
|
def object (io, id, gen)
|
140
|
-
@
|
141
|
-
@xref = XRef.new(@buffer)
|
140
|
+
@xref = XRef.new(io)
|
142
141
|
@xref.load
|
143
142
|
|
144
143
|
@xref.object(Reference.new(id, gen))
|
data/lib/pdf/reader/buffer.rb
CHANGED
@@ -27,7 +27,7 @@
|
|
27
27
|
|
28
28
|
class PDF::Reader
|
29
29
|
|
30
|
-
# A string tokeniser that recognises PDF
|
30
|
+
# A string tokeniser that recognises PDF grammar. When passed an IO stream or a
|
31
31
|
# string, repeated calls to token() will return the next token from the source.
|
32
32
|
#
|
33
33
|
# This is very low level, and getting the raw tokens is not very useful in itself.
|
@@ -99,7 +99,7 @@ class PDF::Reader
|
|
99
99
|
end
|
100
100
|
|
101
101
|
# return raw bytes from the underlying IO stream. All bytes up to the first
|
102
|
-
#
|
102
|
+
# occurrence of needle will be returned. The match (if any) is not returned.
|
103
103
|
# The IO stream cursor is left on the first byte of the match.
|
104
104
|
#
|
105
105
|
# needle - a string to search the IO stream for
|
@@ -129,7 +129,7 @@ class PDF::Reader
|
|
129
129
|
reset_pos
|
130
130
|
prepare_tokens if @tokens.size < 3
|
131
131
|
merge_indirect_reference
|
132
|
-
|
132
|
+
prepare_tokens if @tokens.size < 3
|
133
133
|
|
134
134
|
@tokens.shift
|
135
135
|
end
|
@@ -206,10 +206,14 @@ class PDF::Reader
|
|
206
206
|
# them, replace the tokens with a PDF::Reader::Reference instance.
|
207
207
|
#
|
208
208
|
# Merging them into a single string was another option, but that would mean
|
209
|
-
# code further up the
|
209
|
+
# code further up the stack would need to check every token to see if it looks
|
210
210
|
# like an indirect object. For optimisation reasons, I'd rather avoid
|
211
211
|
# that extra check.
|
212
212
|
#
|
213
|
+
# It's incredibly likely that the next 3 tokens in the buffer are NOT an
|
214
|
+
# indirect reference, so test for that case first and avoid the relatively
|
215
|
+
# expensive regexp checks if possible.
|
216
|
+
#
|
213
217
|
def merge_indirect_reference
|
214
218
|
return if @tokens.size < 3
|
215
219
|
return if @tokens[2] != "R"
|
@@ -222,26 +226,8 @@ class PDF::Reader
|
|
222
226
|
end
|
223
227
|
end
|
224
228
|
|
225
|
-
# merge any consequtive tokens that are actually 1 token. The only current
|
226
|
-
# time this is the case is << and >>. < and > are valid tokens (they indicate
|
227
|
-
# a hex string) but so are << and >> (they indicate a dictionary).
|
228
|
-
#
|
229
|
-
def merge_tokens
|
230
|
-
@tokens.each_with_index do |tok, idx|
|
231
|
-
if tok == "<" && @tokens[idx+1] == "<"
|
232
|
-
@tokens.inspect
|
233
|
-
@tokens[idx] = "<<"
|
234
|
-
@tokens[idx+1] = nil
|
235
|
-
elsif tok == ">" && @tokens[idx+1] == ">"
|
236
|
-
@tokens[idx] = ">>"
|
237
|
-
@tokens[idx+1] = nil
|
238
|
-
end
|
239
|
-
end
|
240
|
-
@tokens.compact!
|
241
|
-
end
|
242
|
-
|
243
229
|
# if we're currently inside a literal string we more or less just read bytes until
|
244
|
-
# we find the
|
230
|
+
# we find the closing ) delimiter. Lots of bytes that would otherwise indicate the
|
245
231
|
# start of a new token in regular mode are left untouched when inside a literal
|
246
232
|
# string.
|
247
233
|
#
|
@@ -294,13 +280,27 @@ class PDF::Reader
|
|
294
280
|
@tokens << tok if tok.size > 0
|
295
281
|
tok = ""
|
296
282
|
break
|
297
|
-
when "\
|
283
|
+
when "\x3C"
|
284
|
+
# opening delimiter '<', start of new token
|
285
|
+
@tokens << tok if tok.size > 0
|
286
|
+
chr << @io.read(1) if peek_char == "\x3C" # check if token is actually '<<'
|
287
|
+
@tokens << chr
|
288
|
+
tok = ""
|
289
|
+
break
|
290
|
+
when "\x3E"
|
291
|
+
# closing delimiter '>', start of new token
|
292
|
+
@tokens << tok if tok.size > 0
|
293
|
+
chr << @io.read(1) if peek_char == "\x3E" # check if token is actually '>>'
|
294
|
+
@tokens << chr
|
295
|
+
tok = ""
|
296
|
+
break
|
297
|
+
when "\x28", "\x5B", "\x7B", "\x2F"
|
298
298
|
# opening delimiter, start of new token
|
299
299
|
@tokens << tok if tok.size > 0
|
300
300
|
@tokens << chr
|
301
301
|
tok = ""
|
302
302
|
break
|
303
|
-
when "\x29", "\
|
303
|
+
when "\x29", "\x5D", "\x7D"
|
304
304
|
# closing delimiter
|
305
305
|
@tokens << tok if tok.size > 0
|
306
306
|
@tokens << chr
|
@@ -313,5 +313,14 @@ class PDF::Reader
|
|
313
313
|
|
314
314
|
@tokens << tok if tok.size > 0
|
315
315
|
end
|
316
|
+
|
317
|
+
# peek at the next character in the io stream, leaving the stream position
|
318
|
+
# untouched
|
319
|
+
#
|
320
|
+
def peek_char
|
321
|
+
chr = @io.read(1)
|
322
|
+
@io.seek(-1, IO::SEEK_CUR) unless chr.nil?
|
323
|
+
chr
|
324
|
+
end
|
316
325
|
end
|
317
326
|
end
|
metadata
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdf-reader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.2
|
4
|
+
version: 0.8.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
|
-
-
|
7
|
+
- James Healy
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2010-
|
12
|
+
date: 2010-02-14 00:00:00 +11:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -23,7 +23,7 @@ dependencies:
|
|
23
23
|
version: "0.9"
|
24
24
|
version:
|
25
25
|
description: The PDF::Reader library implements a PDF parser conforming as much as possible to the PDF specification from Adobe
|
26
|
-
email:
|
26
|
+
email: jimmy@deefa.com
|
27
27
|
executables:
|
28
28
|
- pdf_object
|
29
29
|
- pdf_text
|
@@ -76,7 +76,7 @@ files:
|
|
76
76
|
- CHANGELOG
|
77
77
|
- MIT-LICENSE
|
78
78
|
has_rdoc: true
|
79
|
-
homepage: http://
|
79
|
+
homepage: http://github.com/yob/pdf-reader
|
80
80
|
licenses: []
|
81
81
|
|
82
82
|
post_install_message:
|