pdf-reader 0.8.3 → 0.8.4

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG CHANGED
@@ -1,3 +1,13 @@
1
+ v0.8.4 (30th March 2010)
2
+ - fix parsing of files that use Form XObjects
3
+ - thanks to Andrea Barisani for reporting the issue
4
+ - fix two issues that caused a small number of characters to convert to Unicode
5
+ incorrectly
6
+ - thanks to Andrea Barisani for reporting the issue
7
+ - require 'pdf-reader' now works as well as 'pdf/reader'
8
+ - good practice to have the require file match the gem name
9
+ - thanks to Chris O'Meara for highlighting this
10
+
1
11
  v0.8.3 (14th February 2010)
2
12
  - Fix a bug in tokenising of hex strings inside dictionaries
3
13
  - Thanks to Brad Ediger for detecting the issue and proposing a solution
data/Rakefile CHANGED
@@ -6,7 +6,7 @@ require 'rake/testtask'
6
6
  require "rake/gempackagetask"
7
7
  require 'spec/rake/spectask'
8
8
 
9
- PKG_VERSION = "0.8.3"
9
+ PKG_VERSION = "0.8.4"
10
10
  PKG_NAME = "pdf-reader"
11
11
  PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"
12
12
 
@@ -29,6 +29,7 @@ class PageTextReceiver
29
29
 
30
30
  def show_text_with_positioning(*params)
31
31
  params = params.first
32
+ params ||= []
32
33
  params.each { |str| show_text(str) if str.kind_of?(String)}
33
34
  end
34
35
  end
@@ -0,0 +1,25 @@
1
+ #!/usr/bin/env ruby
2
+ # coding: utf-8
3
+
4
+ # Determine the PDF version of a file
5
+
6
+ require 'rubygems'
7
+ require 'pdf/reader'
8
+
9
+ class VersionReceiver
10
+ attr_accessor :version
11
+
12
+ def initialize
13
+ @version = nil
14
+ end
15
+
16
+ # Called when document parsing starts
17
+ def pdf_version(arg = nil)
18
+ @version = arg
19
+ end
20
+
21
+ end
22
+
23
+ receiver = VersionReceiver.new
24
+ pdf = PDF::Reader.file(ARGV.shift, receiver)
25
+ puts receiver.version
@@ -0,0 +1 @@
1
+ require "pdf/reader"
@@ -9,10 +9,10 @@
9
9
  # distribute, sublicense, and/or sell copies of the Software, and to
10
10
  # permit persons to whom the Software is furnished to do so, subject to
11
11
  # the following conditions:
12
- #
12
+ #
13
13
  # The above copyright notice and this permission notice shall be
14
14
  # included in all copies or substantial portions of the Software.
15
- #
15
+ #
16
16
  # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
17
  # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
18
  # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
@@ -30,26 +30,33 @@ class PDF::Reader
30
30
  @map = {}
31
31
  in_char_mode = false
32
32
  in_range_mode = false
33
+ instructions = ""
33
34
 
34
35
  data.each_line do |l|
35
36
  if l.include?("beginbfchar")
36
- in_char_mode = true
37
+ in_char_mode = true
37
38
  elsif l.include?("endbfchar")
38
- in_char_mode = false
39
+ process_bfchar_instructions(instructions)
40
+ instructions = ""
41
+ in_char_mode = false
39
42
  elsif l.include?("beginbfrange")
40
- in_range_mode = true
43
+ in_range_mode = true
41
44
  elsif l.include?("endbfrange")
42
- in_range_mode = false
45
+ process_bfrange_instructions(instructions)
46
+ instructions = ""
47
+ in_range_mode = false
43
48
  end
44
49
 
45
- if in_char_mode
46
- process_bfchar_line(l)
47
- elsif in_range_mode
48
- process_bfrange_line(l)
50
+ if !l.include?("begin") && (in_char_mode || in_range_mode)
51
+ instructions << l
49
52
  end
50
53
  end
51
54
  end
52
55
 
56
+ def size
57
+ @map.size
58
+ end
59
+
53
60
  def decode(c)
54
61
  # TODO: implement the conversion
55
62
  return c unless c.class == Fixnum
@@ -58,24 +65,72 @@ class PDF::Reader
58
65
 
59
66
  private
60
67
 
61
- def process_bfchar_line(l)
62
- m, find, replace = *l.match(/<([0-9a-fA-F]+)>\s*<([0-9a-fA-F]+)>/)
63
- @map["0x#{find}".hex] = "0x#{replace}".hex if find && replace
68
+ def build_parser(instructions)
69
+ buffer = Buffer.new(StringIO.new(instructions))
70
+ Parser.new(buffer)
71
+ end
72
+
73
+ def str_to_int(str)
74
+ return nil if str.nil? || str.size == 0 || str.size >= 3
75
+
76
+ if str.size == 1
77
+ str.unpack("C*")[0]
78
+ else
79
+ str.unpack("n*")[0]
80
+ end
64
81
  end
65
82
 
66
- def process_bfrange_line(l)
67
- m, start_code, end_code, dst = *l.match(/<([0-9a-fA-F]+)>\s*<([0-9a-fA-F]+)>\s*<([0-9a-fA-F]+)>/)
68
- if start_code && end_code && dst
69
- start_code = "0x#{start_code}".hex
70
- end_code = "0x#{end_code}".hex
71
- dst = "0x#{dst}".hex
72
-
73
- # add all values in the range to our mapping
74
- (start_code..end_code).each_with_index do |val, idx|
75
- @map[val] = dst + idx
76
- # ensure a single range does not exceed 255 chars
77
- raise PDF::Reader::MalformedPDFError, "a CMap bfrange cann't exceed 255 chars" if idx > 255
83
+ def process_bfchar_instructions(instructions)
84
+ parser = build_parser(instructions)
85
+ find = str_to_int(parser.parse_token)
86
+ replace = str_to_int(parser.parse_token)
87
+ while find && replace
88
+ @map[find] = replace
89
+ find = str_to_int(parser.parse_token)
90
+ replace = str_to_int(parser.parse_token)
91
+ end
92
+ end
93
+
94
+ def process_bfrange_instructions(instructions)
95
+ parser = build_parser(instructions)
96
+ start = parser.parse_token
97
+ finish = parser.parse_token
98
+ to = parser.parse_token
99
+ while start && finish && to
100
+ if start.kind_of?(String) && finish.kind_of?(String) && to.kind_of?(String)
101
+ bfrange_type_one(start, finish, to)
102
+ elsif start.kind_of?(String) && finish.kind_of?(String) && to.kind_of?(Array)
103
+ bfrange_type_two(start, finish, to)
104
+ else
105
+ raise "invalid bfrange section"
78
106
  end
107
+ start = parser.parse_token
108
+ finish = parser.parse_token
109
+ to = parser.parse_token
110
+ end
111
+ end
112
+
113
+ def bfrange_type_one(start_code, end_code, dst)
114
+ start_code = str_to_int(start_code)
115
+ end_code = str_to_int(end_code)
116
+ dst = str_to_int(dst)
117
+
118
+ # add all values in the range to our mapping
119
+ (start_code..end_code).each_with_index do |val, idx|
120
+ @map[val] = dst + idx
121
+ # ensure a single range does not exceed 255 chars
122
+ raise PDF::Reader::MalformedPDFError, "a CMap bfrange cann't exceed 255 chars" if idx > 255
123
+ end
124
+ end
125
+
126
+ def bfrange_type_two(start_code, end_code, dst)
127
+ start_code = str_to_int(start_code)
128
+ end_code = str_to_int(end_code)
129
+ from_range = (start_code..end_code)
130
+
131
+ # add all values in the range to our mapping
132
+ from_range.each_with_index do |val, idx|
133
+ @map[val] = str_to_int(dst[idx])
79
134
  end
80
135
  end
81
136
  end
@@ -251,7 +251,6 @@ class PDF::Reader
251
251
  def initialize (receiver, xref)
252
252
  @receiver = receiver
253
253
  @xref = xref
254
- @fonts ||= {}
255
254
  end
256
255
  ################################################################################
257
256
  # Begin processing the document metadata
@@ -309,10 +308,14 @@ class PDF::Reader
309
308
  contents = [page[:Contents]]
310
309
  end
311
310
 
312
- contents.each do |content|
313
- obj = @xref.object(content)
314
- content_stream(obj)
315
- end if page.has_key?(:Contents) and page[:Contents]
311
+ fonts = font_hash_from_resources(current_resources)
312
+
313
+ if page.has_key?(:Contents) and page[:Contents]
314
+ contents.each do |content|
315
+ obj = @xref.object(content)
316
+ content_stream(obj, fonts)
317
+ end
318
+ end
316
319
 
317
320
  resources.pop if res
318
321
  callback(:end_page)
@@ -330,7 +333,8 @@ class PDF::Reader
330
333
  callback(:begin_form_xobject)
331
334
  resources = @xref.object(xobject.hash[:Resources])
332
335
  walk_resources(resources) if resources
333
- content_stream(xobject)
336
+ fonts = font_hash_from_resources(resources)
337
+ content_stream(xobject, fonts)
334
338
  callback(:end_form_xobject)
335
339
  end
336
340
  end
@@ -348,42 +352,43 @@ class PDF::Reader
348
352
  ################################################################################
349
353
  # Reads a PDF content stream and calls all the appropriate callback methods for the operators
350
354
  # it contains
351
- def content_stream (instructions)
355
+ def content_stream (instructions, fonts = {})
352
356
  instructions = instructions.unfiltered_data if instructions.kind_of?(PDF::Reader::Stream)
353
- @buffer = Buffer.new(StringIO.new(instructions))
354
- @parser = Parser.new(@buffer, @xref)
355
- @params ||= []
357
+ buffer = Buffer.new(StringIO.new(instructions))
358
+ parser = Parser.new(buffer, @xref)
359
+ current_font = nil
360
+ params = []
356
361
 
357
- while (token = @parser.parse_token(OPERATORS))
362
+ while (token = parser.parse_token(OPERATORS))
358
363
  if token.kind_of?(Token) and OPERATORS.has_key?(token)
359
- @current_font = @params.first if OPERATORS[token] == :set_text_font_and_size
364
+ current_font = params.first if OPERATORS[token] == :set_text_font_and_size
360
365
 
361
366
  # handle special cases in response to certain operators
362
- if OPERATORS[token].to_s.include?("show_text") && @fonts[@current_font]
367
+ if OPERATORS[token].to_s.include?("show_text") && fonts[current_font]
363
368
  # convert any text to utf-8
364
- @params = @fonts[@current_font].to_utf8(@params)
369
+ params = fonts[current_font].to_utf8(params)
365
370
  elsif token == "ID"
366
371
  # inline image data, first convert the current params into a more familiar hash
367
372
  map = {}
368
- @params.each_slice(2) do |a|
373
+ params.each_slice(2) do |a|
369
374
  map[a.first] = a.last
370
375
  end
371
- @params = [map]
376
+ params = [map]
372
377
  # read the raw image data from the buffer without tokenising
373
- @params << @buffer.read_until("EI")
378
+ params << buffer.read_until("EI")
374
379
  end
375
380
 
376
- callback(OPERATORS[token], @params)
381
+ callback(OPERATORS[token], params)
377
382
 
378
383
  if OPERATORS[token] == :invoke_xobject
379
- xobject_label = @params.first
380
- @params.clear
384
+ xobject_label = params.first
385
+ params.clear
381
386
  walk_xobject_form(xobject_label)
382
387
  else
383
- @params.clear
388
+ params.clear
384
389
  end
385
390
  else
386
- @params << token
391
+ params << token
387
392
  end
388
393
  end
389
394
  rescue EOFError => e
@@ -430,24 +435,9 @@ class PDF::Reader
430
435
 
431
436
  # extract any font information
432
437
  if resources[:Font]
433
- @xref.object(resources[:Font]).each do |label, desc|
434
- desc = @xref.object(desc)
435
- @fonts[label] = PDF::Reader::Font.new
436
- @fonts[label].label = label
437
- @fonts[label].subtype = desc[:Subtype] if desc[:Subtype]
438
- @fonts[label].basefont = desc[:BaseFont] if desc[:BaseFont]
439
- @fonts[label].encoding = PDF::Reader::Encoding.new(@xref.object(desc[:Encoding]))
440
- @fonts[label].descendantfonts = desc[:DescendantFonts] if desc[:DescendantFonts]
441
- if desc[:ToUnicode]
442
- # this stream is a cmap
443
- begin
444
- stream = desc[:ToUnicode]
445
- @fonts[label].tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
446
- rescue
447
- # if the CMap fails to parse, don't worry too much. Means we can't translate the text properly
448
- end
449
- end
450
- callback(:resource_font, [label, @fonts[label]])
438
+ fonts = font_hash_from_resources(resources)
439
+ fonts.each do |label, font|
440
+ callback(:resource_font, [label, fonts])
451
441
  end
452
442
  end
453
443
  end
@@ -473,6 +463,32 @@ class PDF::Reader
473
463
  end
474
464
  ################################################################################
475
465
  private
466
+ ################################################################################
467
+ def font_hash_from_resources(resources)
468
+ return {} unless resources.respond_to?(:[])
469
+
470
+ fonts = {}
471
+ resources = @xref.object(resources[:Font]) || {}
472
+ resources.each do |label, desc|
473
+ desc = @xref.object(desc)
474
+ fonts[label] = PDF::Reader::Font.new
475
+ fonts[label].label = label
476
+ fonts[label].subtype = desc[:Subtype] if desc[:Subtype]
477
+ fonts[label].basefont = desc[:BaseFont] if desc[:BaseFont]
478
+ fonts[label].encoding = PDF::Reader::Encoding.new(@xref.object(desc[:Encoding]))
479
+ fonts[label].descendantfonts = desc[:DescendantFonts] if desc[:DescendantFonts]
480
+ if desc[:ToUnicode]
481
+ # this stream is a cmap
482
+ begin
483
+ stream = desc[:ToUnicode]
484
+ fonts[label].tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
485
+ rescue
486
+ # if the CMap fails to parse, don't worry too much. Means we can't translate the text properly
487
+ end
488
+ end
489
+ end
490
+ fonts
491
+ end
476
492
  # strings outside of page content should be in either PDFDocEncoding or UTF-16.
477
493
  def decode_strings(obj)
478
494
  case obj
@@ -30,9 +30,11 @@ class PDF::Reader
30
30
 
31
31
  UNKNOWN_CHAR = 0x25AF # ▯
32
32
 
33
- attr_reader :differences
33
+ attr_reader :differences, :unpack
34
34
 
35
35
  def initialize(enc)
36
+ @to_unicode_required = false
37
+
36
38
  if enc.kind_of?(Hash)
37
39
  self.differences=enc[:Differences] if enc[:Differences]
38
40
  enc = enc[:Encoding] || enc[:BaseEncoding]
@@ -74,6 +76,10 @@ class PDF::Reader
74
76
  end
75
77
  end
76
78
 
79
+ def to_unicode_required?
80
+ @to_unicode_required
81
+ end
82
+
77
83
  # set the differences table for this encoding. should be an array in the following format:
78
84
  #
79
85
  # [25, :A, 26, :B]
@@ -101,25 +107,22 @@ class PDF::Reader
101
107
 
102
108
  # convert the specified string to utf8
103
109
  def to_utf8(str, tounicode = nil)
104
-
105
110
  # unpack the single bytes
106
- array_orig = str.unpack(@unpack)
111
+ array_orig = str.unpack(unpack)
107
112
 
108
113
  # replace any relevant bytes with a glyph name
109
114
  array_orig = process_differences(array_orig)
110
115
 
111
116
  # replace any remaining bytes with a unicode codepoint
112
- array_enc = []
113
- array_orig.each do |num|
117
+ array_enc = array_orig.map do |num|
114
118
  if tounicode && (code = tounicode.decode(num))
115
- array_enc << code
116
- elsif tounicode || ( tounicode.nil? && defined?(@to_unicode_required) &&
117
- @to_unicode_required )
118
- array_enc << PDF::Reader::Encoding::UNKNOWN_CHAR
119
- elsif defined?(@mapping) && @mapping && @mapping[num]
120
- array_enc << @mapping[num]
119
+ code
120
+ elsif tounicode || ( tounicode.nil? && to_unicode_required? )
121
+ PDF::Reader::Encoding::UNKNOWN_CHAR
122
+ elsif mapping[num]
123
+ mapping[num]
121
124
  else
122
- array_enc << num
125
+ num
123
126
  end
124
127
  end
125
128
 
@@ -140,6 +143,14 @@ class PDF::Reader
140
143
 
141
144
  private
142
145
 
146
+ def mapping
147
+ @mapping ||= {}
148
+ end
149
+
150
+ def has_mapping?
151
+ mapping.size > 0
152
+ end
153
+
143
154
  # accepts an array of byte numbers, and replaces any that have entries in the differences table
144
155
  # with a glyph name
145
156
  def process_differences(arr)
@@ -154,12 +165,13 @@ class PDF::Reader
154
165
  end
155
166
 
156
167
  def load_mapping(file)
157
- @mapping = {}
168
+ return if has_mapping?
169
+
158
170
  RUBY_VERSION >= "1.9" ? mode = "r:BINARY" : mode = "r"
159
171
  File.open(file, mode) do |f|
160
172
  f.each do |l|
161
173
  m, single_byte, unicode = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
162
- @mapping["0x#{single_byte}".hex] = "0x#{unicode}".hex if single_byte
174
+ mapping["0x#{single_byte}".hex] = "0x#{unicode}".hex if single_byte
163
175
  end
164
176
  end
165
177
  end
@@ -33,7 +33,7 @@ class PDF::Reader
33
33
  #
34
34
  # buffer - a PDF::Reader::Buffer object that contains PDF data
35
35
  # xref - a PDF::Reader::XRef object that represents the document's object offsets
36
- def initialize (buffer, xref)
36
+ def initialize (buffer, xref=nil)
37
37
  @buffer = buffer
38
38
  @xref = xref
39
39
  end
@@ -48,7 +48,7 @@ class PDF::Reader
48
48
  case token
49
49
  when PDF::Reader::Reference then return token
50
50
  when nil then return nil
51
- when "/" then return @buffer.token.to_sym
51
+ when "/" then return pdf_name()
52
52
  when "<<" then return dictionary()
53
53
  when "[" then return array()
54
54
  when "(" then return string()
@@ -107,6 +107,16 @@ class PDF::Reader
107
107
  dict
108
108
  end
109
109
  ################################################################################
110
+ # reads a PDF name from the buffer and converts it to a Ruby Symbol
111
+ def pdf_name
112
+ tok = @buffer.token
113
+ tok.scan(/#(\d\d)/).each do |find|
114
+ replace = find[0].hex.chr
115
+ tok.gsub!("#"+find[0], replace)
116
+ end
117
+ tok.to_sym
118
+ end
119
+ ################################################################################
110
120
  # reads a PDF array from the buffer and converts it to a Ruby Array.
111
121
  def array
112
122
  a = []
@@ -141,6 +151,7 @@ class PDF::Reader
141
151
  return "" if str == ")"
142
152
  Error.assert_equal(parse_token, ")")
143
153
 
154
+ str.gsub!(/([^\\])(\n\r|\r\n|\r)/m,'\1\n')
144
155
  str.gsub!("\\n","\n")
145
156
  str.gsub!("\\r","\r")
146
157
  str.gsub!("\\t","\t")
@@ -150,7 +161,6 @@ class PDF::Reader
150
161
  str.gsub!("\\)",")")
151
162
  str.gsub!("\\\\","\\")
152
163
  str.gsub!(/\\\n/m,"")
153
- str.gsub!(/(\n\r|\r\n|\r)/m,"\n")
154
164
 
155
165
  str.scan(/\\\d{1,3}/).each do |octal|
156
166
  str.gsub!(octal, octal[1,3].oct.chr)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdf-reader
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.3
4
+ version: 0.8.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Healy
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2010-02-14 00:00:00 +11:00
12
+ date: 2010-03-30 00:00:00 +05:30
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -43,6 +43,7 @@ files:
43
43
  - examples/hash.rb
44
44
  - examples/callbacks.rb
45
45
  - examples/text.rb
46
+ - examples/version.rb
46
47
  - examples/page_counter_improved.rb
47
48
  - lib/pdf/reader/glyphlist.txt
48
49
  - lib/pdf/reader/content.rb
@@ -70,6 +71,7 @@ files:
70
71
  - lib/pdf/reader/parser.rb
71
72
  - lib/pdf/hash.rb
72
73
  - lib/pdf/reader.rb
74
+ - lib/pdf-reader.rb
73
75
  - Rakefile
74
76
  - README.rdoc
75
77
  - TODO