pdf-reader 0.8.3 → 0.8.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG CHANGED
@@ -1,3 +1,13 @@
1
+ v0.8.4 (XXX)
2
+ - fix parsing of files that use Form XObjects
3
+ - thanks to Andrea Barisani for reporting the issue
4
+ - fix two issues that caused a small number of characters to convert to Unicode
5
+ incorrectly
6
+ - thanks to Andrea Barisani for reporting the issue
7
+ - require 'pdf-reader' now works as well as 'pdf/reader'
8
+ - good practice to have the require file match the gem name
9
+ - thanks to Chris O'Meara for highlighting this
10
+
1
11
  v0.8.3 (14th February 2010)
2
12
  - Fix a bug in tokenising of hex strings inside dictionaries
3
13
  - Thanks to Brad Ediger for detecting the issue and proposing a solution
data/Rakefile CHANGED
@@ -6,7 +6,7 @@ require 'rake/testtask'
6
6
  require "rake/gempackagetask"
7
7
  require 'spec/rake/spectask'
8
8
 
9
- PKG_VERSION = "0.8.3"
9
+ PKG_VERSION = "0.8.4"
10
10
  PKG_NAME = "pdf-reader"
11
11
  PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"
12
12
 
@@ -29,6 +29,7 @@ class PageTextReceiver
29
29
 
30
30
  def show_text_with_positioning(*params)
31
31
  params = params.first
32
+ params ||= []
32
33
  params.each { |str| show_text(str) if str.kind_of?(String)}
33
34
  end
34
35
  end
@@ -0,0 +1,25 @@
1
+ #!/usr/bin/env ruby
2
+ # coding: utf-8
3
+
4
+ # Determine the PDF version of a file
5
+
6
+ require 'rubygems'
7
+ require 'pdf/reader'
8
+
9
+ class VersionReceiver
10
+ attr_accessor :version
11
+
12
+ def initialize
13
+ @version = nil
14
+ end
15
+
16
+ # Called when document parsing starts
17
+ def pdf_version(arg = nil)
18
+ @version = arg
19
+ end
20
+
21
+ end
22
+
23
+ receiver = VersionReceiver.new
24
+ pdf = PDF::Reader.file(ARGV.shift, receiver)
25
+ puts receiver.version
@@ -0,0 +1 @@
1
+ require "pdf/reader"
@@ -9,10 +9,10 @@
9
9
  # distribute, sublicense, and/or sell copies of the Software, and to
10
10
  # permit persons to whom the Software is furnished to do so, subject to
11
11
  # the following conditions:
12
- #
12
+ #
13
13
  # The above copyright notice and this permission notice shall be
14
14
  # included in all copies or substantial portions of the Software.
15
- #
15
+ #
16
16
  # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
17
  # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
18
  # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
@@ -30,26 +30,33 @@ class PDF::Reader
30
30
  @map = {}
31
31
  in_char_mode = false
32
32
  in_range_mode = false
33
+ instructions = ""
33
34
 
34
35
  data.each_line do |l|
35
36
  if l.include?("beginbfchar")
36
- in_char_mode = true
37
+ in_char_mode = true
37
38
  elsif l.include?("endbfchar")
38
- in_char_mode = false
39
+ process_bfchar_instructions(instructions)
40
+ instructions = ""
41
+ in_char_mode = false
39
42
  elsif l.include?("beginbfrange")
40
- in_range_mode = true
43
+ in_range_mode = true
41
44
  elsif l.include?("endbfrange")
42
- in_range_mode = false
45
+ process_bfrange_instructions(instructions)
46
+ instructions = ""
47
+ in_range_mode = false
43
48
  end
44
49
 
45
- if in_char_mode
46
- process_bfchar_line(l)
47
- elsif in_range_mode
48
- process_bfrange_line(l)
50
+ if !l.include?("begin") && (in_char_mode || in_range_mode)
51
+ instructions << l
49
52
  end
50
53
  end
51
54
  end
52
55
 
56
+ def size
57
+ @map.size
58
+ end
59
+
53
60
  def decode(c)
54
61
  # TODO: implement the conversion
55
62
  return c unless c.class == Fixnum
@@ -58,24 +65,72 @@ class PDF::Reader
58
65
 
59
66
  private
60
67
 
61
- def process_bfchar_line(l)
62
- m, find, replace = *l.match(/<([0-9a-fA-F]+)>\s*<([0-9a-fA-F]+)>/)
63
- @map["0x#{find}".hex] = "0x#{replace}".hex if find && replace
68
+ def build_parser(instructions)
69
+ buffer = Buffer.new(StringIO.new(instructions))
70
+ Parser.new(buffer)
71
+ end
72
+
73
+ def str_to_int(str)
74
+ return nil if str.nil? || str.size == 0 || str.size >= 3
75
+
76
+ if str.size == 1
77
+ str.unpack("C*")[0]
78
+ else
79
+ str.unpack("n*")[0]
80
+ end
64
81
  end
65
82
 
66
- def process_bfrange_line(l)
67
- m, start_code, end_code, dst = *l.match(/<([0-9a-fA-F]+)>\s*<([0-9a-fA-F]+)>\s*<([0-9a-fA-F]+)>/)
68
- if start_code && end_code && dst
69
- start_code = "0x#{start_code}".hex
70
- end_code = "0x#{end_code}".hex
71
- dst = "0x#{dst}".hex
72
-
73
- # add all values in the range to our mapping
74
- (start_code..end_code).each_with_index do |val, idx|
75
- @map[val] = dst + idx
76
- # ensure a single range does not exceed 255 chars
77
- raise PDF::Reader::MalformedPDFError, "a CMap bfrange cann't exceed 255 chars" if idx > 255
83
+ def process_bfchar_instructions(instructions)
84
+ parser = build_parser(instructions)
85
+ find = str_to_int(parser.parse_token)
86
+ replace = str_to_int(parser.parse_token)
87
+ while find && replace
88
+ @map[find] = replace
89
+ find = str_to_int(parser.parse_token)
90
+ replace = str_to_int(parser.parse_token)
91
+ end
92
+ end
93
+
94
+ def process_bfrange_instructions(instructions)
95
+ parser = build_parser(instructions)
96
+ start = parser.parse_token
97
+ finish = parser.parse_token
98
+ to = parser.parse_token
99
+ while start && finish && to
100
+ if start.kind_of?(String) && finish.kind_of?(String) && to.kind_of?(String)
101
+ bfrange_type_one(start, finish, to)
102
+ elsif start.kind_of?(String) && finish.kind_of?(String) && to.kind_of?(Array)
103
+ bfrange_type_two(start, finish, to)
104
+ else
105
+ raise "invalid bfrange section"
78
106
  end
107
+ start = parser.parse_token
108
+ finish = parser.parse_token
109
+ to = parser.parse_token
110
+ end
111
+ end
112
+
113
+ def bfrange_type_one(start_code, end_code, dst)
114
+ start_code = str_to_int(start_code)
115
+ end_code = str_to_int(end_code)
116
+ dst = str_to_int(dst)
117
+
118
+ # add all values in the range to our mapping
119
+ (start_code..end_code).each_with_index do |val, idx|
120
+ @map[val] = dst + idx
121
+ # ensure a single range does not exceed 255 chars
122
+ raise PDF::Reader::MalformedPDFError, "a CMap bfrange cann't exceed 255 chars" if idx > 255
123
+ end
124
+ end
125
+
126
+ def bfrange_type_two(start_code, end_code, dst)
127
+ start_code = str_to_int(start_code)
128
+ end_code = str_to_int(end_code)
129
+ from_range = (start_code..end_code)
130
+
131
+ # add all values in the range to our mapping
132
+ from_range.each_with_index do |val, idx|
133
+ @map[val] = str_to_int(dst[idx])
79
134
  end
80
135
  end
81
136
  end
@@ -251,7 +251,6 @@ class PDF::Reader
251
251
  def initialize (receiver, xref)
252
252
  @receiver = receiver
253
253
  @xref = xref
254
- @fonts ||= {}
255
254
  end
256
255
  ################################################################################
257
256
  # Begin processing the document metadata
@@ -309,10 +308,14 @@ class PDF::Reader
309
308
  contents = [page[:Contents]]
310
309
  end
311
310
 
312
- contents.each do |content|
313
- obj = @xref.object(content)
314
- content_stream(obj)
315
- end if page.has_key?(:Contents) and page[:Contents]
311
+ fonts = font_hash_from_resources(current_resources)
312
+
313
+ if page.has_key?(:Contents) and page[:Contents]
314
+ contents.each do |content|
315
+ obj = @xref.object(content)
316
+ content_stream(obj, fonts)
317
+ end
318
+ end
316
319
 
317
320
  resources.pop if res
318
321
  callback(:end_page)
@@ -330,7 +333,8 @@ class PDF::Reader
330
333
  callback(:begin_form_xobject)
331
334
  resources = @xref.object(xobject.hash[:Resources])
332
335
  walk_resources(resources) if resources
333
- content_stream(xobject)
336
+ fonts = font_hash_from_resources(resources)
337
+ content_stream(xobject, fonts)
334
338
  callback(:end_form_xobject)
335
339
  end
336
340
  end
@@ -348,42 +352,43 @@ class PDF::Reader
348
352
  ################################################################################
349
353
  # Reads a PDF content stream and calls all the appropriate callback methods for the operators
350
354
  # it contains
351
- def content_stream (instructions)
355
+ def content_stream (instructions, fonts = {})
352
356
  instructions = instructions.unfiltered_data if instructions.kind_of?(PDF::Reader::Stream)
353
- @buffer = Buffer.new(StringIO.new(instructions))
354
- @parser = Parser.new(@buffer, @xref)
355
- @params ||= []
357
+ buffer = Buffer.new(StringIO.new(instructions))
358
+ parser = Parser.new(buffer, @xref)
359
+ current_font = nil
360
+ params = []
356
361
 
357
- while (token = @parser.parse_token(OPERATORS))
362
+ while (token = parser.parse_token(OPERATORS))
358
363
  if token.kind_of?(Token) and OPERATORS.has_key?(token)
359
- @current_font = @params.first if OPERATORS[token] == :set_text_font_and_size
364
+ current_font = params.first if OPERATORS[token] == :set_text_font_and_size
360
365
 
361
366
  # handle special cases in response to certain operators
362
- if OPERATORS[token].to_s.include?("show_text") && @fonts[@current_font]
367
+ if OPERATORS[token].to_s.include?("show_text") && fonts[current_font]
363
368
  # convert any text to utf-8
364
- @params = @fonts[@current_font].to_utf8(@params)
369
+ params = fonts[current_font].to_utf8(params)
365
370
  elsif token == "ID"
366
371
  # inline image data, first convert the current params into a more familiar hash
367
372
  map = {}
368
- @params.each_slice(2) do |a|
373
+ params.each_slice(2) do |a|
369
374
  map[a.first] = a.last
370
375
  end
371
- @params = [map]
376
+ params = [map]
372
377
  # read the raw image data from the buffer without tokenising
373
- @params << @buffer.read_until("EI")
378
+ params << buffer.read_until("EI")
374
379
  end
375
380
 
376
- callback(OPERATORS[token], @params)
381
+ callback(OPERATORS[token], params)
377
382
 
378
383
  if OPERATORS[token] == :invoke_xobject
379
- xobject_label = @params.first
380
- @params.clear
384
+ xobject_label = params.first
385
+ params.clear
381
386
  walk_xobject_form(xobject_label)
382
387
  else
383
- @params.clear
388
+ params.clear
384
389
  end
385
390
  else
386
- @params << token
391
+ params << token
387
392
  end
388
393
  end
389
394
  rescue EOFError => e
@@ -430,24 +435,9 @@ class PDF::Reader
430
435
 
431
436
  # extract any font information
432
437
  if resources[:Font]
433
- @xref.object(resources[:Font]).each do |label, desc|
434
- desc = @xref.object(desc)
435
- @fonts[label] = PDF::Reader::Font.new
436
- @fonts[label].label = label
437
- @fonts[label].subtype = desc[:Subtype] if desc[:Subtype]
438
- @fonts[label].basefont = desc[:BaseFont] if desc[:BaseFont]
439
- @fonts[label].encoding = PDF::Reader::Encoding.new(@xref.object(desc[:Encoding]))
440
- @fonts[label].descendantfonts = desc[:DescendantFonts] if desc[:DescendantFonts]
441
- if desc[:ToUnicode]
442
- # this stream is a cmap
443
- begin
444
- stream = desc[:ToUnicode]
445
- @fonts[label].tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
446
- rescue
447
- # if the CMap fails to parse, don't worry too much. Means we can't translate the text properly
448
- end
449
- end
450
- callback(:resource_font, [label, @fonts[label]])
438
+ fonts = font_hash_from_resources(resources)
439
+ fonts.each do |label, font|
440
+ callback(:resource_font, [label, fonts])
451
441
  end
452
442
  end
453
443
  end
@@ -473,6 +463,32 @@ class PDF::Reader
473
463
  end
474
464
  ################################################################################
475
465
  private
466
+ ################################################################################
467
+ def font_hash_from_resources(resources)
468
+ return {} unless resources.respond_to?(:[])
469
+
470
+ fonts = {}
471
+ resources = @xref.object(resources[:Font]) || {}
472
+ resources.each do |label, desc|
473
+ desc = @xref.object(desc)
474
+ fonts[label] = PDF::Reader::Font.new
475
+ fonts[label].label = label
476
+ fonts[label].subtype = desc[:Subtype] if desc[:Subtype]
477
+ fonts[label].basefont = desc[:BaseFont] if desc[:BaseFont]
478
+ fonts[label].encoding = PDF::Reader::Encoding.new(@xref.object(desc[:Encoding]))
479
+ fonts[label].descendantfonts = desc[:DescendantFonts] if desc[:DescendantFonts]
480
+ if desc[:ToUnicode]
481
+ # this stream is a cmap
482
+ begin
483
+ stream = desc[:ToUnicode]
484
+ fonts[label].tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
485
+ rescue
486
+ # if the CMap fails to parse, don't worry too much. Means we can't translate the text properly
487
+ end
488
+ end
489
+ end
490
+ fonts
491
+ end
476
492
  # strings outside of page content should be in either PDFDocEncoding or UTF-16.
477
493
  def decode_strings(obj)
478
494
  case obj
@@ -30,9 +30,11 @@ class PDF::Reader
30
30
 
31
31
  UNKNOWN_CHAR = 0x25AF # ▯
32
32
 
33
- attr_reader :differences
33
+ attr_reader :differences, :unpack
34
34
 
35
35
  def initialize(enc)
36
+ @to_unicode_required = false
37
+
36
38
  if enc.kind_of?(Hash)
37
39
  self.differences=enc[:Differences] if enc[:Differences]
38
40
  enc = enc[:Encoding] || enc[:BaseEncoding]
@@ -74,6 +76,10 @@ class PDF::Reader
74
76
  end
75
77
  end
76
78
 
79
+ def to_unicode_required?
80
+ @to_unicode_required
81
+ end
82
+
77
83
  # set the differences table for this encoding. should be an array in the following format:
78
84
  #
79
85
  # [25, :A, 26, :B]
@@ -101,25 +107,22 @@ class PDF::Reader
101
107
 
102
108
  # convert the specified string to utf8
103
109
  def to_utf8(str, tounicode = nil)
104
-
105
110
  # unpack the single bytes
106
- array_orig = str.unpack(@unpack)
111
+ array_orig = str.unpack(unpack)
107
112
 
108
113
  # replace any relevant bytes with a glyph name
109
114
  array_orig = process_differences(array_orig)
110
115
 
111
116
  # replace any remaining bytes with a unicode codepoint
112
- array_enc = []
113
- array_orig.each do |num|
117
+ array_enc = array_orig.map do |num|
114
118
  if tounicode && (code = tounicode.decode(num))
115
- array_enc << code
116
- elsif tounicode || ( tounicode.nil? && defined?(@to_unicode_required) &&
117
- @to_unicode_required )
118
- array_enc << PDF::Reader::Encoding::UNKNOWN_CHAR
119
- elsif defined?(@mapping) && @mapping && @mapping[num]
120
- array_enc << @mapping[num]
119
+ code
120
+ elsif tounicode || ( tounicode.nil? && to_unicode_required? )
121
+ PDF::Reader::Encoding::UNKNOWN_CHAR
122
+ elsif mapping[num]
123
+ mapping[num]
121
124
  else
122
- array_enc << num
125
+ num
123
126
  end
124
127
  end
125
128
 
@@ -140,6 +143,14 @@ class PDF::Reader
140
143
 
141
144
  private
142
145
 
146
+ def mapping
147
+ @mapping ||= {}
148
+ end
149
+
150
+ def has_mapping?
151
+ mapping.size > 0
152
+ end
153
+
143
154
  # accepts an array of byte numbers, and replaces any that have entries in the differences table
144
155
  # with a glyph name
145
156
  def process_differences(arr)
@@ -154,12 +165,13 @@ class PDF::Reader
154
165
  end
155
166
 
156
167
  def load_mapping(file)
157
- @mapping = {}
168
+ return if has_mapping?
169
+
158
170
  RUBY_VERSION >= "1.9" ? mode = "r:BINARY" : mode = "r"
159
171
  File.open(file, mode) do |f|
160
172
  f.each do |l|
161
173
  m, single_byte, unicode = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
162
- @mapping["0x#{single_byte}".hex] = "0x#{unicode}".hex if single_byte
174
+ mapping["0x#{single_byte}".hex] = "0x#{unicode}".hex if single_byte
163
175
  end
164
176
  end
165
177
  end
@@ -33,7 +33,7 @@ class PDF::Reader
33
33
  #
34
34
  # buffer - a PDF::Reader::Buffer object that contains PDF data
35
35
  # xref - a PDF::Reader::XRef object that represents the document's object offsets
36
- def initialize (buffer, xref)
36
+ def initialize (buffer, xref=nil)
37
37
  @buffer = buffer
38
38
  @xref = xref
39
39
  end
@@ -48,7 +48,7 @@ class PDF::Reader
48
48
  case token
49
49
  when PDF::Reader::Reference then return token
50
50
  when nil then return nil
51
- when "/" then return @buffer.token.to_sym
51
+ when "/" then return pdf_name()
52
52
  when "<<" then return dictionary()
53
53
  when "[" then return array()
54
54
  when "(" then return string()
@@ -107,6 +107,16 @@ class PDF::Reader
107
107
  dict
108
108
  end
109
109
  ################################################################################
110
+ # reads a PDF name from the buffer and converts it to a Ruby Symbol
111
+ def pdf_name
112
+ tok = @buffer.token
113
+ tok.scan(/#(\d\d)/).each do |find|
114
+ replace = find[0].hex.chr
115
+ tok.gsub!("#"+find[0], replace)
116
+ end
117
+ tok.to_sym
118
+ end
119
+ ################################################################################
110
120
  # reads a PDF array from the buffer and converts it to a Ruby Array.
111
121
  def array
112
122
  a = []
@@ -141,6 +151,7 @@ class PDF::Reader
141
151
  return "" if str == ")"
142
152
  Error.assert_equal(parse_token, ")")
143
153
 
154
+ str.gsub!(/([^\\])(\n\r|\r\n|\r)/m,'\1\n')
144
155
  str.gsub!("\\n","\n")
145
156
  str.gsub!("\\r","\r")
146
157
  str.gsub!("\\t","\t")
@@ -150,7 +161,6 @@ class PDF::Reader
150
161
  str.gsub!("\\)",")")
151
162
  str.gsub!("\\\\","\\")
152
163
  str.gsub!(/\\\n/m,"")
153
- str.gsub!(/(\n\r|\r\n|\r)/m,"\n")
154
164
 
155
165
  str.scan(/\\\d{1,3}/).each do |octal|
156
166
  str.gsub!(octal, octal[1,3].oct.chr)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdf-reader
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.3
4
+ version: 0.8.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Healy
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2010-02-14 00:00:00 +11:00
12
+ date: 2010-03-30 00:00:00 +05:30
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -43,6 +43,7 @@ files:
43
43
  - examples/hash.rb
44
44
  - examples/callbacks.rb
45
45
  - examples/text.rb
46
+ - examples/version.rb
46
47
  - examples/page_counter_improved.rb
47
48
  - lib/pdf/reader/glyphlist.txt
48
49
  - lib/pdf/reader/content.rb
@@ -70,6 +71,7 @@ files:
70
71
  - lib/pdf/reader/parser.rb
71
72
  - lib/pdf/hash.rb
72
73
  - lib/pdf/reader.rb
74
+ - lib/pdf-reader.rb
73
75
  - Rakefile
74
76
  - README.rdoc
75
77
  - TODO