pdf-reader 0.8.2 → 0.8.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG CHANGED
@@ -1,3 +1,7 @@
1
+ v0.8.3 (14th February 2010)
2
+ - Fix a bug in tokenising of hex strings inside dictionaries
3
+ - Thanks to Brad Ediger for detecting the issue and proposing a solution
4
+
1
5
  v0.8.2 (1st January 2010)
2
6
  - Fix parsing of files that use Form XObjects behind an indirect reference
3
7
  (thanks Cornelius Illi and Patrick Crosby)
data/Rakefile CHANGED
@@ -6,7 +6,7 @@ require 'rake/testtask'
6
6
  require "rake/gempackagetask"
7
7
  require 'spec/rake/spectask'
8
8
 
9
- PKG_VERSION = "0.8.2"
9
+ PKG_VERSION = "0.8.3"
10
10
  PKG_NAME = "pdf-reader"
11
11
  PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"
12
12
 
@@ -57,27 +57,25 @@ end
57
57
  # RSpec files aren't included, as they depend on the PDF files,
58
58
  # which will make the gem filesize irritatingly large
59
59
  spec = Gem::Specification.new do |spec|
60
- spec.name = PKG_NAME
61
- spec.version = PKG_VERSION
62
- spec.platform = Gem::Platform::RUBY
63
- spec.summary = "A library for accessing the content of PDF files"
64
- spec.files = Dir.glob("{examples,lib}/**/**/*") +
65
- ["Rakefile"]
66
-
60
+ spec.name = PKG_NAME
61
+ spec.version = PKG_VERSION
62
+ spec.platform = Gem::Platform::RUBY
63
+ spec.summary = "A library for accessing the content of PDF files"
64
+ spec.files = Dir.glob("{examples,lib}/**/**/*") + ["Rakefile"]
67
65
  spec.require_path = "lib"
68
66
  spec.bindir = "bin"
69
67
  spec.executables << "pdf_object"
70
68
  spec.executables << "pdf_text"
71
69
  spec.executables << "pdf_list_callbacks"
72
- spec.has_rdoc = true
73
- spec.extra_rdoc_files = %w{README.rdoc TODO CHANGELOG MIT-LICENSE }
74
- spec.rdoc_options << '--title' << 'PDF::Reader Documentation' <<
75
- '--main' << 'README.rdoc' << '-q'
76
- spec.author = "Peter Jones"
77
- spec.email = "pjones@pmade.com"
78
- spec.rubyforge_project = "pdf-reader"
79
- spec.homepage = "http://software.pmade.com/pdfreader"
80
- spec.description = "The PDF::Reader library implements a PDF parser conforming as much as possible to the PDF specification from Adobe"
70
+ spec.has_rdoc = true
71
+ spec.extra_rdoc_files = %w{README.rdoc TODO CHANGELOG MIT-LICENSE }
72
+ spec.rdoc_options << '--title' << 'PDF::Reader Documentation' <<
73
+ '--main' << 'README.rdoc' << '-q'
74
+ spec.author = "James Healy"
75
+ spec.email = "jimmy@deefa.com"
76
+ spec.rubyforge_project = "pdf-reader"
77
+ spec.homepage = "http://github.com/yob/pdf-reader"
78
+ spec.description = "The PDF::Reader library implements a PDF parser conforming as much as possible to the PDF specification from Adobe"
81
79
  spec.add_dependency('Ascii85', '>=0.9')
82
80
  end
83
81
 
data/lib/pdf/reader.rb CHANGED
@@ -137,8 +137,7 @@ class PDF::Reader
137
137
  ################################################################################
138
138
  # Given an IO object that contains PDF data, return the contents of a single object
139
139
  def object (io, id, gen)
140
- @buffer = Buffer.new(io)
141
- @xref = XRef.new(@buffer)
140
+ @xref = XRef.new(io)
142
141
  @xref.load
143
142
 
144
143
  @xref.object(Reference.new(id, gen))
data/lib/pdf/reader/buffer.rb CHANGED
@@ -27,7 +27,7 @@
27
27
 
28
28
  class PDF::Reader
29
29
 
30
- # A string tokeniser that recognises PDF grammer. When passed an IO stream or a
30
+ # A string tokeniser that recognises PDF grammar. When passed an IO stream or a
31
31
  # string, repeated calls to token() will return the next token from the source.
32
32
  #
33
33
  # This is very low level, and getting the raw tokens is not very useful in itself.
@@ -99,7 +99,7 @@ class PDF::Reader
99
99
  end
100
100
 
101
101
  # return raw bytes from the underlying IO stream. All bytes up to the first
102
- # occurance of needle will be returned. The match (if any) is not returned.
102
+ # occurrence of needle will be returned. The match (if any) is not returned.
103
103
  # The IO stream cursor is left on the first byte of the match.
104
104
  #
105
105
  # needle - a string to search the IO stream for
@@ -129,7 +129,7 @@ class PDF::Reader
129
129
  reset_pos
130
130
  prepare_tokens if @tokens.size < 3
131
131
  merge_indirect_reference
132
- merge_tokens
132
+ prepare_tokens if @tokens.size < 3
133
133
 
134
134
  @tokens.shift
135
135
  end
@@ -206,10 +206,14 @@ class PDF::Reader
206
206
  # them, replace the tokens with a PDF::Reader::Reference instance.
207
207
  #
208
208
  # Merging them into a single string was another option, but that would mean
209
- # code further up the stact would need to check every token to see if it looks
209
+ # code further up the stack would need to check every token to see if it looks
210
210
  # like an indirect object. For optimisation reasons, I'd rather avoid
211
211
  # that extra check.
212
212
  #
213
+ # It's incredibly likely that the next 3 tokens in the buffer are NOT an
214
+ # indirect reference, so test for that case first and avoid the relatively
215
+ # expensive regexp checks if possible.
216
+ #
213
217
  def merge_indirect_reference
214
218
  return if @tokens.size < 3
215
219
  return if @tokens[2] != "R"
@@ -222,26 +226,8 @@ class PDF::Reader
222
226
  end
223
227
  end
224
228
 
225
- # merge any consequtive tokens that are actually 1 token. The only current
226
- # time this is the case is << and >>. < and > are valid tokens (they indicate
227
- # a hex string) but so are << and >> (they indicate a dictionary).
228
- #
229
- def merge_tokens
230
- @tokens.each_with_index do |tok, idx|
231
- if tok == "<" && @tokens[idx+1] == "<"
232
- @tokens.inspect
233
- @tokens[idx] = "<<"
234
- @tokens[idx+1] = nil
235
- elsif tok == ">" && @tokens[idx+1] == ">"
236
- @tokens[idx] = ">>"
237
- @tokens[idx+1] = nil
238
- end
239
- end
240
- @tokens.compact!
241
- end
242
-
243
229
  # if we're currently inside a literal string we more or less just read bytes until
244
- # we find the closes ) delimiter. Lots of bytes that would otherwise indicate the
230
+ # we find the closing ) delimiter. Lots of bytes that would otherwise indicate the
245
231
  # start of a new token in regular mode are left untouched when inside a literal
246
232
  # string.
247
233
  #
@@ -294,13 +280,27 @@ class PDF::Reader
294
280
  @tokens << tok if tok.size > 0
295
281
  tok = ""
296
282
  break
297
- when "\x28", "\x3C", "\x5B", "\x7B", "\x2F"
283
+ when "\x3C"
284
+ # opening delimiter '<', start of new token
285
+ @tokens << tok if tok.size > 0
286
+ chr << @io.read(1) if peek_char == "\x3C" # check if token is actually '<<'
287
+ @tokens << chr
288
+ tok = ""
289
+ break
290
+ when "\x3E"
291
+ # closing delimiter '>', start of new token
292
+ @tokens << tok if tok.size > 0
293
+ chr << @io.read(1) if peek_char == "\x3E" # check if token is actually '>>'
294
+ @tokens << chr
295
+ tok = ""
296
+ break
297
+ when "\x28", "\x5B", "\x7B", "\x2F"
298
298
  # opening delimiter, start of new token
299
299
  @tokens << tok if tok.size > 0
300
300
  @tokens << chr
301
301
  tok = ""
302
302
  break
303
- when "\x29", "\x3E", "\x5D", "\x7D"
303
+ when "\x29", "\x5D", "\x7D"
304
304
  # closing delimiter
305
305
  @tokens << tok if tok.size > 0
306
306
  @tokens << chr
@@ -313,5 +313,14 @@ class PDF::Reader
313
313
 
314
314
  @tokens << tok if tok.size > 0
315
315
  end
316
+
317
+ # peek at the next character in the io stream, leaving the stream position
318
+ # untouched
319
+ #
320
+ def peek_char
321
+ chr = @io.read(1)
322
+ @io.seek(-1, IO::SEEK_CUR) unless chr.nil?
323
+ chr
324
+ end
316
325
  end
317
326
  end
metadata CHANGED
@@ -1,15 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdf-reader
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.2
4
+ version: 0.8.3
5
5
  platform: ruby
6
6
  authors:
7
- - Peter Jones
7
+ - James Healy
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2010-01-01 00:00:00 +11:00
12
+ date: 2010-02-14 00:00:00 +11:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -23,7 +23,7 @@ dependencies:
23
23
  version: "0.9"
24
24
  version:
25
25
  description: The PDF::Reader library implements a PDF parser conforming as much as possible to the PDF specification from Adobe
26
- email: pjones@pmade.com
26
+ email: jimmy@deefa.com
27
27
  executables:
28
28
  - pdf_object
29
29
  - pdf_text
@@ -76,7 +76,7 @@ files:
76
76
  - CHANGELOG
77
77
  - MIT-LICENSE
78
78
  has_rdoc: true
79
- homepage: http://software.pmade.com/pdfreader
79
+ homepage: http://github.com/yob/pdf-reader
80
80
  licenses: []
81
81
 
82
82
  post_install_message: