pdf-reader 0.8.3 → 0.8.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +10 -0
- data/Rakefile +1 -1
- data/bin/pdf_text +1 -0
- data/examples/version.rb +25 -0
- data/lib/pdf-reader.rb +1 -0
- data/lib/pdf/reader/cmap.rb +80 -25
- data/lib/pdf/reader/content.rb +56 -40
- data/lib/pdf/reader/encoding.rb +26 -14
- data/lib/pdf/reader/parser.rb +13 -3
- metadata +4 -2
    
        data/CHANGELOG
    CHANGED
    
    | @@ -1,3 +1,13 @@ | |
| 1 | 
            +
            v0.8.4 (XXX)
         | 
| 2 | 
            +
            - fix parsing of files that use Form XObjects
         | 
| 3 | 
            +
              - thanks to Andrea Barisani for reporting the issue
         | 
| 4 | 
            +
            - fix two issues that caused a small number of characters to convert to Unicode
         | 
| 5 | 
            +
              incorrectly
         | 
| 6 | 
            +
              - thanks to Andrea Barisani for reporting the issue
         | 
| 7 | 
            +
            - require 'pdf-reader' now works as well as 'pdf/reader'
         | 
| 8 | 
            +
              - good practice to have the require file match the gem name
         | 
| 9 | 
            +
              - thanks to Chris O'Meara for highlighting this
         | 
| 10 | 
            +
             | 
| 1 11 | 
             
            v0.8.3 (14th February 2010)
         | 
| 2 12 | 
             
            - Fix a bug in tokenising of hex strings inside dictionaries
         | 
| 3 13 | 
             
              - Thanks to Brad Ediger for detecting the issue and proposing a solution
         | 
    
        data/Rakefile
    CHANGED
    
    
    
        data/bin/pdf_text
    CHANGED
    
    
    
        data/examples/version.rb
    ADDED
    
    | @@ -0,0 +1,25 @@ | |
| 1 | 
            +
            #!/usr/bin/env ruby
         | 
| 2 | 
            +
            # coding: utf-8
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            # Determine the PDF version of a file
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            require 'rubygems'
         | 
| 7 | 
            +
            require 'pdf/reader'
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            class VersionReceiver
         | 
| 10 | 
            +
              attr_accessor :version
         | 
| 11 | 
            +
             | 
| 12 | 
            +
              def initialize
         | 
| 13 | 
            +
                @version = nil
         | 
| 14 | 
            +
              end
         | 
| 15 | 
            +
             | 
| 16 | 
            +
              # Called when document parsing starts
         | 
| 17 | 
            +
              def pdf_version(arg = nil)
         | 
| 18 | 
            +
                @version = arg
         | 
| 19 | 
            +
              end
         | 
| 20 | 
            +
             | 
| 21 | 
            +
            end
         | 
| 22 | 
            +
             | 
| 23 | 
            +
            receiver = VersionReceiver.new
         | 
| 24 | 
            +
            pdf = PDF::Reader.file(ARGV.shift, receiver)
         | 
| 25 | 
            +
            puts receiver.version
         | 
    
        data/lib/pdf-reader.rb
    ADDED
    
    | @@ -0,0 +1 @@ | |
| 1 | 
            +
            require "pdf/reader"
         | 
    
        data/lib/pdf/reader/cmap.rb
    CHANGED
    
    | @@ -9,10 +9,10 @@ | |
| 9 9 | 
             
            # distribute, sublicense, and/or sell copies of the Software, and to
         | 
| 10 10 | 
             
            # permit persons to whom the Software is furnished to do so, subject to
         | 
| 11 11 | 
             
            # the following conditions:
         | 
| 12 | 
            -
            # | 
| 12 | 
            +
            #
         | 
| 13 13 | 
             
            # The above copyright notice and this permission notice shall be
         | 
| 14 14 | 
             
            # included in all copies or substantial portions of the Software.
         | 
| 15 | 
            -
            # | 
| 15 | 
            +
            #
         | 
| 16 16 | 
             
            # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
         | 
| 17 17 | 
             
            # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
         | 
| 18 18 | 
             
            # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
         | 
| @@ -30,26 +30,33 @@ class PDF::Reader | |
| 30 30 | 
             
                  @map = {}
         | 
| 31 31 | 
             
                  in_char_mode = false
         | 
| 32 32 | 
             
                  in_range_mode = false
         | 
| 33 | 
            +
                  instructions = ""
         | 
| 33 34 |  | 
| 34 35 | 
             
                  data.each_line do |l|
         | 
| 35 36 | 
             
                    if l.include?("beginbfchar")
         | 
| 36 | 
            -
                      in_char_mode = true | 
| 37 | 
            +
                      in_char_mode = true
         | 
| 37 38 | 
             
                    elsif l.include?("endbfchar")
         | 
| 38 | 
            -
                       | 
| 39 | 
            +
                      process_bfchar_instructions(instructions)
         | 
| 40 | 
            +
                      instructions = ""
         | 
| 41 | 
            +
                      in_char_mode = false
         | 
| 39 42 | 
             
                    elsif l.include?("beginbfrange")
         | 
| 40 | 
            -
                      in_range_mode = true | 
| 43 | 
            +
                      in_range_mode = true
         | 
| 41 44 | 
             
                    elsif l.include?("endbfrange")
         | 
| 42 | 
            -
                       | 
| 45 | 
            +
                      process_bfrange_instructions(instructions)
         | 
| 46 | 
            +
                      instructions = ""
         | 
| 47 | 
            +
                      in_range_mode = false
         | 
| 43 48 | 
             
                    end
         | 
| 44 49 |  | 
| 45 | 
            -
                    if in_char_mode
         | 
| 46 | 
            -
                       | 
| 47 | 
            -
                    elsif in_range_mode
         | 
| 48 | 
            -
                      process_bfrange_line(l)
         | 
| 50 | 
            +
                    if !l.include?("begin") && (in_char_mode || in_range_mode)
         | 
| 51 | 
            +
                      instructions << l
         | 
| 49 52 | 
             
                    end
         | 
| 50 53 | 
             
                  end
         | 
| 51 54 | 
             
                end
         | 
| 52 55 |  | 
| 56 | 
            +
                def size
         | 
| 57 | 
            +
                  @map.size
         | 
| 58 | 
            +
                end
         | 
| 59 | 
            +
             | 
| 53 60 | 
             
                def decode(c)
         | 
| 54 61 | 
             
                  # TODO: implement the conversion
         | 
| 55 62 | 
             
                  return c unless c.class == Fixnum
         | 
| @@ -58,24 +65,72 @@ class PDF::Reader | |
| 58 65 |  | 
| 59 66 | 
             
                private
         | 
| 60 67 |  | 
| 61 | 
            -
                def  | 
| 62 | 
            -
                   | 
| 63 | 
            -
                   | 
| 68 | 
            +
                def build_parser(instructions)
         | 
| 69 | 
            +
                  buffer = Buffer.new(StringIO.new(instructions))
         | 
| 70 | 
            +
                  Parser.new(buffer)
         | 
| 71 | 
            +
                end
         | 
| 72 | 
            +
             | 
| 73 | 
            +
                def str_to_int(str)
         | 
| 74 | 
            +
                  return nil if str.nil? || str.size == 0 || str.size >= 3
         | 
| 75 | 
            +
             | 
| 76 | 
            +
                  if str.size == 1
         | 
| 77 | 
            +
                    str.unpack("C*")[0]
         | 
| 78 | 
            +
                  else
         | 
| 79 | 
            +
                    str.unpack("n*")[0]
         | 
| 80 | 
            +
                  end
         | 
| 64 81 | 
             
                end
         | 
| 65 82 |  | 
| 66 | 
            -
                def  | 
| 67 | 
            -
                   | 
| 68 | 
            -
                   | 
| 69 | 
            -
             | 
| 70 | 
            -
             | 
| 71 | 
            -
                     | 
| 72 | 
            -
             | 
| 73 | 
            -
                     | 
| 74 | 
            -
             | 
| 75 | 
            -
             | 
| 76 | 
            -
             | 
| 77 | 
            -
             | 
| 83 | 
            +
                def process_bfchar_instructions(instructions)
         | 
| 84 | 
            +
                  parser  = build_parser(instructions)
         | 
| 85 | 
            +
                  find    = str_to_int(parser.parse_token)
         | 
| 86 | 
            +
                  replace = str_to_int(parser.parse_token)
         | 
| 87 | 
            +
                  while find && replace
         | 
| 88 | 
            +
                    @map[find] = replace
         | 
| 89 | 
            +
                    find       = str_to_int(parser.parse_token)
         | 
| 90 | 
            +
                    replace    = str_to_int(parser.parse_token)
         | 
| 91 | 
            +
                  end
         | 
| 92 | 
            +
                end
         | 
| 93 | 
            +
             | 
| 94 | 
            +
                def process_bfrange_instructions(instructions)
         | 
| 95 | 
            +
                  parser  = build_parser(instructions)
         | 
| 96 | 
            +
                  start   = parser.parse_token
         | 
| 97 | 
            +
                  finish  = parser.parse_token
         | 
| 98 | 
            +
                  to      = parser.parse_token
         | 
| 99 | 
            +
                  while start && finish && to
         | 
| 100 | 
            +
                    if start.kind_of?(String) && finish.kind_of?(String) && to.kind_of?(String)
         | 
| 101 | 
            +
                      bfrange_type_one(start, finish, to)
         | 
| 102 | 
            +
                    elsif start.kind_of?(String) && finish.kind_of?(String) && to.kind_of?(Array)
         | 
| 103 | 
            +
                      bfrange_type_two(start, finish, to)
         | 
| 104 | 
            +
                    else
         | 
| 105 | 
            +
                      raise "invalid bfrange section"
         | 
| 78 106 | 
             
                    end
         | 
| 107 | 
            +
                    start   = parser.parse_token
         | 
| 108 | 
            +
                    finish  = parser.parse_token
         | 
| 109 | 
            +
                    to      = parser.parse_token
         | 
| 110 | 
            +
                  end
         | 
| 111 | 
            +
                end
         | 
| 112 | 
            +
             | 
| 113 | 
            +
                def bfrange_type_one(start_code, end_code, dst)
         | 
| 114 | 
            +
                  start_code = str_to_int(start_code)
         | 
| 115 | 
            +
                  end_code   = str_to_int(end_code)
         | 
| 116 | 
            +
                  dst        = str_to_int(dst)
         | 
| 117 | 
            +
             | 
| 118 | 
            +
                  # add all values in the range to our mapping
         | 
| 119 | 
            +
                  (start_code..end_code).each_with_index do |val, idx|
         | 
| 120 | 
            +
                    @map[val] = dst + idx
         | 
| 121 | 
            +
                    # ensure a single range does not exceed 255 chars
         | 
| 122 | 
            +
                    raise PDF::Reader::MalformedPDFError, "a CMap bfrange cann't exceed 255 chars" if idx > 255
         | 
| 123 | 
            +
                  end
         | 
| 124 | 
            +
                end
         | 
| 125 | 
            +
             | 
| 126 | 
            +
                def bfrange_type_two(start_code, end_code, dst)
         | 
| 127 | 
            +
                  start_code = str_to_int(start_code)
         | 
| 128 | 
            +
                  end_code   = str_to_int(end_code)
         | 
| 129 | 
            +
                  from_range = (start_code..end_code)
         | 
| 130 | 
            +
             | 
| 131 | 
            +
                  # add all values in the range to our mapping
         | 
| 132 | 
            +
                  from_range.each_with_index do |val, idx|
         | 
| 133 | 
            +
                    @map[val] = str_to_int(dst[idx])
         | 
| 79 134 | 
             
                  end
         | 
| 80 135 | 
             
                end
         | 
| 81 136 | 
             
              end
         | 
    
        data/lib/pdf/reader/content.rb
    CHANGED
    
    | @@ -251,7 +251,6 @@ class PDF::Reader | |
| 251 251 | 
             
                def initialize (receiver, xref)
         | 
| 252 252 | 
             
                  @receiver = receiver
         | 
| 253 253 | 
             
                  @xref     = xref
         | 
| 254 | 
            -
                  @fonts ||= {}
         | 
| 255 254 | 
             
                end
         | 
| 256 255 | 
             
                ################################################################################
         | 
| 257 256 | 
             
                # Begin processing the document metadata
         | 
| @@ -309,10 +308,14 @@ class PDF::Reader | |
| 309 308 | 
             
                      contents = [page[:Contents]]
         | 
| 310 309 | 
             
                    end
         | 
| 311 310 |  | 
| 312 | 
            -
                     | 
| 313 | 
            -
             | 
| 314 | 
            -
             | 
| 315 | 
            -
             | 
| 311 | 
            +
                    fonts = font_hash_from_resources(current_resources)
         | 
| 312 | 
            +
             | 
| 313 | 
            +
                    if page.has_key?(:Contents) and page[:Contents]
         | 
| 314 | 
            +
                      contents.each do |content|
         | 
| 315 | 
            +
                        obj = @xref.object(content)
         | 
| 316 | 
            +
                        content_stream(obj, fonts)
         | 
| 317 | 
            +
                      end 
         | 
| 318 | 
            +
                    end
         | 
| 316 319 |  | 
| 317 320 | 
             
                    resources.pop if res
         | 
| 318 321 | 
             
                    callback(:end_page)
         | 
| @@ -330,7 +333,8 @@ class PDF::Reader | |
| 330 333 | 
             
                    callback(:begin_form_xobject)
         | 
| 331 334 | 
             
                    resources = @xref.object(xobject.hash[:Resources])
         | 
| 332 335 | 
             
                    walk_resources(resources) if resources
         | 
| 333 | 
            -
                     | 
| 336 | 
            +
                    fonts = font_hash_from_resources(resources)
         | 
| 337 | 
            +
                    content_stream(xobject, fonts)
         | 
| 334 338 | 
             
                    callback(:end_form_xobject)
         | 
| 335 339 | 
             
                  end
         | 
| 336 340 | 
             
                end
         | 
| @@ -348,42 +352,43 @@ class PDF::Reader | |
| 348 352 | 
             
                ################################################################################
         | 
| 349 353 | 
             
                # Reads a PDF content stream and calls all the appropriate callback methods for the operators
         | 
| 350 354 | 
             
                # it contains
         | 
| 351 | 
            -
                def content_stream (instructions)
         | 
| 355 | 
            +
                def content_stream (instructions, fonts = {})
         | 
| 352 356 | 
             
                  instructions = instructions.unfiltered_data if instructions.kind_of?(PDF::Reader::Stream)
         | 
| 353 | 
            -
                   | 
| 354 | 
            -
                   | 
| 355 | 
            -
                   | 
| 357 | 
            +
                  buffer       = Buffer.new(StringIO.new(instructions))
         | 
| 358 | 
            +
                  parser       = Parser.new(buffer, @xref)
         | 
| 359 | 
            +
                  current_font = nil
         | 
| 360 | 
            +
                  params       = []
         | 
| 356 361 |  | 
| 357 | 
            -
                  while (token =  | 
| 362 | 
            +
                  while (token = parser.parse_token(OPERATORS))
         | 
| 358 363 | 
             
                    if token.kind_of?(Token) and OPERATORS.has_key?(token)
         | 
| 359 | 
            -
                       | 
| 364 | 
            +
                      current_font = params.first if OPERATORS[token] == :set_text_font_and_size
         | 
| 360 365 |  | 
| 361 366 | 
             
                      # handle special cases in response to certain operators
         | 
| 362 | 
            -
                      if OPERATORS[token].to_s.include?("show_text") &&  | 
| 367 | 
            +
                      if OPERATORS[token].to_s.include?("show_text") && fonts[current_font]
         | 
| 363 368 | 
             
                        # convert any text to utf-8
         | 
| 364 | 
            -
                         | 
| 369 | 
            +
                        params = fonts[current_font].to_utf8(params)
         | 
| 365 370 | 
             
                      elsif token == "ID"
         | 
| 366 371 | 
             
                        # inline image data, first convert the current params into a more familiar hash
         | 
| 367 372 | 
             
                        map = {}
         | 
| 368 | 
            -
                         | 
| 373 | 
            +
                        params.each_slice(2) do |a|
         | 
| 369 374 | 
             
                          map[a.first] = a.last
         | 
| 370 375 | 
             
                        end
         | 
| 371 | 
            -
                         | 
| 376 | 
            +
                        params = [map]
         | 
| 372 377 | 
             
                        # read the raw image data from the buffer without tokenising
         | 
| 373 | 
            -
                         | 
| 378 | 
            +
                        params << buffer.read_until("EI")
         | 
| 374 379 | 
             
                      end
         | 
| 375 380 |  | 
| 376 | 
            -
                      callback(OPERATORS[token],  | 
| 381 | 
            +
                      callback(OPERATORS[token], params)
         | 
| 377 382 |  | 
| 378 383 | 
             
                      if OPERATORS[token] == :invoke_xobject
         | 
| 379 | 
            -
                        xobject_label =  | 
| 380 | 
            -
                         | 
| 384 | 
            +
                        xobject_label = params.first
         | 
| 385 | 
            +
                        params.clear
         | 
| 381 386 | 
             
                        walk_xobject_form(xobject_label)
         | 
| 382 387 | 
             
                      else
         | 
| 383 | 
            -
                         | 
| 388 | 
            +
                        params.clear
         | 
| 384 389 | 
             
                      end
         | 
| 385 390 | 
             
                    else
         | 
| 386 | 
            -
                       | 
| 391 | 
            +
                      params << token
         | 
| 387 392 | 
             
                    end
         | 
| 388 393 | 
             
                  end
         | 
| 389 394 | 
             
                rescue EOFError => e
         | 
| @@ -430,24 +435,9 @@ class PDF::Reader | |
| 430 435 |  | 
| 431 436 | 
             
                  # extract any font information
         | 
| 432 437 | 
             
                  if resources[:Font]
         | 
| 433 | 
            -
                     | 
| 434 | 
            -
             | 
| 435 | 
            -
                       | 
| 436 | 
            -
                      @fonts[label].label = label
         | 
| 437 | 
            -
                      @fonts[label].subtype = desc[:Subtype] if desc[:Subtype]
         | 
| 438 | 
            -
                      @fonts[label].basefont = desc[:BaseFont] if desc[:BaseFont]
         | 
| 439 | 
            -
                      @fonts[label].encoding = PDF::Reader::Encoding.new(@xref.object(desc[:Encoding]))
         | 
| 440 | 
            -
                      @fonts[label].descendantfonts = desc[:DescendantFonts] if desc[:DescendantFonts]
         | 
| 441 | 
            -
                      if desc[:ToUnicode]
         | 
| 442 | 
            -
                        # this stream is a cmap
         | 
| 443 | 
            -
                        begin
         | 
| 444 | 
            -
                          stream = desc[:ToUnicode]
         | 
| 445 | 
            -
                          @fonts[label].tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
         | 
| 446 | 
            -
                        rescue
         | 
| 447 | 
            -
                          # if the CMap fails to parse, don't worry too much. Means we can't translate the text properly
         | 
| 448 | 
            -
                        end
         | 
| 449 | 
            -
                      end
         | 
| 450 | 
            -
                      callback(:resource_font, [label, @fonts[label]])
         | 
| 438 | 
            +
                    fonts = font_hash_from_resources(resources)
         | 
| 439 | 
            +
                    fonts.each do  |label, font|
         | 
| 440 | 
            +
                      callback(:resource_font, [label, fonts])
         | 
| 451 441 | 
             
                    end
         | 
| 452 442 | 
             
                  end
         | 
| 453 443 | 
             
                end
         | 
| @@ -473,6 +463,32 @@ class PDF::Reader | |
| 473 463 | 
             
                end
         | 
| 474 464 | 
             
                ################################################################################
         | 
| 475 465 | 
             
                private
         | 
| 466 | 
            +
                ################################################################################
         | 
| 467 | 
            +
                def font_hash_from_resources(resources)
         | 
| 468 | 
            +
                  return {} unless resources.respond_to?(:[])
         | 
| 469 | 
            +
             | 
| 470 | 
            +
                  fonts = {}
         | 
| 471 | 
            +
                  resources = @xref.object(resources[:Font]) || {}
         | 
| 472 | 
            +
                  resources.each do |label, desc|
         | 
| 473 | 
            +
                    desc = @xref.object(desc)
         | 
| 474 | 
            +
                    fonts[label] = PDF::Reader::Font.new
         | 
| 475 | 
            +
                    fonts[label].label = label
         | 
| 476 | 
            +
                    fonts[label].subtype = desc[:Subtype] if desc[:Subtype]
         | 
| 477 | 
            +
                    fonts[label].basefont = desc[:BaseFont] if desc[:BaseFont]
         | 
| 478 | 
            +
                    fonts[label].encoding = PDF::Reader::Encoding.new(@xref.object(desc[:Encoding]))
         | 
| 479 | 
            +
                    fonts[label].descendantfonts = desc[:DescendantFonts] if desc[:DescendantFonts]
         | 
| 480 | 
            +
                    if desc[:ToUnicode]
         | 
| 481 | 
            +
                      # this stream is a cmap
         | 
| 482 | 
            +
                      begin
         | 
| 483 | 
            +
                        stream = desc[:ToUnicode]
         | 
| 484 | 
            +
                        fonts[label].tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
         | 
| 485 | 
            +
                      rescue
         | 
| 486 | 
            +
                        # if the CMap fails to parse, don't worry too much. Means we can't translate the text properly
         | 
| 487 | 
            +
                      end
         | 
| 488 | 
            +
                    end
         | 
| 489 | 
            +
                  end
         | 
| 490 | 
            +
                  fonts
         | 
| 491 | 
            +
                end
         | 
| 476 492 | 
             
                # strings outside of page content should be in either PDFDocEncoding or UTF-16.
         | 
| 477 493 | 
             
                def decode_strings(obj)
         | 
| 478 494 | 
             
                  case obj
         | 
    
        data/lib/pdf/reader/encoding.rb
    CHANGED
    
    | @@ -30,9 +30,11 @@ class PDF::Reader | |
| 30 30 |  | 
| 31 31 | 
             
                UNKNOWN_CHAR = 0x25AF # ▯
         | 
| 32 32 |  | 
| 33 | 
            -
                attr_reader :differences
         | 
| 33 | 
            +
                attr_reader :differences, :unpack
         | 
| 34 34 |  | 
| 35 35 | 
             
                def initialize(enc)
         | 
| 36 | 
            +
                  @to_unicode_required = false
         | 
| 37 | 
            +
             | 
| 36 38 | 
             
                  if enc.kind_of?(Hash)
         | 
| 37 39 | 
             
                    self.differences=enc[:Differences] if enc[:Differences]
         | 
| 38 40 | 
             
                    enc = enc[:Encoding] || enc[:BaseEncoding]
         | 
| @@ -74,6 +76,10 @@ class PDF::Reader | |
| 74 76 | 
             
                  end
         | 
| 75 77 | 
             
                end
         | 
| 76 78 |  | 
| 79 | 
            +
                def to_unicode_required?
         | 
| 80 | 
            +
                  @to_unicode_required
         | 
| 81 | 
            +
                end
         | 
| 82 | 
            +
             | 
| 77 83 | 
             
                # set the differences table for this encoding. should be an array in the following format:
         | 
| 78 84 | 
             
                #
         | 
| 79 85 | 
             
                #   [25, :A, 26, :B]
         | 
| @@ -101,25 +107,22 @@ class PDF::Reader | |
| 101 107 |  | 
| 102 108 | 
             
                # convert the specified string to utf8
         | 
| 103 109 | 
             
                def to_utf8(str, tounicode = nil)
         | 
| 104 | 
            -
             | 
| 105 110 | 
             
                  # unpack the single bytes
         | 
| 106 | 
            -
                  array_orig = str.unpack( | 
| 111 | 
            +
                  array_orig = str.unpack(unpack)
         | 
| 107 112 |  | 
| 108 113 | 
             
                  # replace any relevant bytes with a glyph name
         | 
| 109 114 | 
             
                  array_orig = process_differences(array_orig)
         | 
| 110 115 |  | 
| 111 116 | 
             
                  # replace any remaining bytes with a unicode codepoint
         | 
| 112 | 
            -
                  array_enc =  | 
| 113 | 
            -
                  array_orig.each do |num|
         | 
| 117 | 
            +
                  array_enc = array_orig.map do |num|
         | 
| 114 118 | 
             
                    if tounicode && (code = tounicode.decode(num))
         | 
| 115 | 
            -
                       | 
| 116 | 
            -
                    elsif tounicode || ( tounicode.nil? &&  | 
| 117 | 
            -
             | 
| 118 | 
            -
             | 
| 119 | 
            -
             | 
| 120 | 
            -
                      array_enc << @mapping[num]
         | 
| 119 | 
            +
                      code
         | 
| 120 | 
            +
                    elsif tounicode || ( tounicode.nil? && to_unicode_required? )
         | 
| 121 | 
            +
                      PDF::Reader::Encoding::UNKNOWN_CHAR
         | 
| 122 | 
            +
                    elsif mapping[num]
         | 
| 123 | 
            +
                      mapping[num]
         | 
| 121 124 | 
             
                    else
         | 
| 122 | 
            -
                       | 
| 125 | 
            +
                      num
         | 
| 123 126 | 
             
                    end
         | 
| 124 127 | 
             
                  end
         | 
| 125 128 |  | 
| @@ -140,6 +143,14 @@ class PDF::Reader | |
| 140 143 |  | 
| 141 144 | 
             
                private
         | 
| 142 145 |  | 
| 146 | 
            +
                def mapping
         | 
| 147 | 
            +
                  @mapping ||= {}
         | 
| 148 | 
            +
                end
         | 
| 149 | 
            +
             | 
| 150 | 
            +
                def has_mapping?
         | 
| 151 | 
            +
                  mapping.size > 0
         | 
| 152 | 
            +
                end
         | 
| 153 | 
            +
             | 
| 143 154 | 
             
                # accepts an array of byte numbers, and replaces any that have entries in the differences table
         | 
| 144 155 | 
             
                # with a glyph name
         | 
| 145 156 | 
             
                def process_differences(arr)
         | 
| @@ -154,12 +165,13 @@ class PDF::Reader | |
| 154 165 | 
             
                end
         | 
| 155 166 |  | 
| 156 167 | 
             
                def load_mapping(file)
         | 
| 157 | 
            -
                   | 
| 168 | 
            +
                  return if has_mapping?
         | 
| 169 | 
            +
             | 
| 158 170 | 
             
                  RUBY_VERSION >= "1.9" ? mode = "r:BINARY" : mode = "r"
         | 
| 159 171 | 
             
                  File.open(file, mode) do |f|
         | 
| 160 172 | 
             
                    f.each do |l|
         | 
| 161 173 | 
             
                      m, single_byte, unicode = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
         | 
| 162 | 
            -
                       | 
| 174 | 
            +
                      mapping["0x#{single_byte}".hex] = "0x#{unicode}".hex if single_byte
         | 
| 163 175 | 
             
                    end
         | 
| 164 176 | 
             
                  end
         | 
| 165 177 | 
             
                end
         | 
    
        data/lib/pdf/reader/parser.rb
    CHANGED
    
    | @@ -33,7 +33,7 @@ class PDF::Reader | |
| 33 33 | 
             
                #
         | 
| 34 34 | 
             
                # buffer - a PDF::Reader::Buffer object that contains PDF data
         | 
| 35 35 | 
             
                # xref   - a PDF::Reader::XRef object that represents the document's object offsets
         | 
| 36 | 
            -
                def initialize (buffer, xref)
         | 
| 36 | 
            +
                def initialize (buffer, xref=nil)
         | 
| 37 37 | 
             
                  @buffer = buffer
         | 
| 38 38 | 
             
                  @xref   = xref
         | 
| 39 39 | 
             
                end
         | 
| @@ -48,7 +48,7 @@ class PDF::Reader | |
| 48 48 | 
             
                  case token
         | 
| 49 49 | 
             
                  when PDF::Reader::Reference     then return token
         | 
| 50 50 | 
             
                  when nil                        then return nil
         | 
| 51 | 
            -
                  when "/"                        then return  | 
| 51 | 
            +
                  when "/"                        then return pdf_name()
         | 
| 52 52 | 
             
                  when "<<"                       then return dictionary()
         | 
| 53 53 | 
             
                  when "["                        then return array()
         | 
| 54 54 | 
             
                  when "("                        then return string()
         | 
| @@ -107,6 +107,16 @@ class PDF::Reader | |
| 107 107 | 
             
                  dict
         | 
| 108 108 | 
             
                end
         | 
| 109 109 | 
             
                ################################################################################
         | 
| 110 | 
            +
                # reads a PDF name from the buffer and converts it to a Ruby Symbol
         | 
| 111 | 
            +
                def pdf_name
         | 
| 112 | 
            +
                  tok = @buffer.token
         | 
| 113 | 
            +
                  tok.scan(/#(\d\d)/).each do |find|
         | 
| 114 | 
            +
                    replace = find[0].hex.chr
         | 
| 115 | 
            +
                    tok.gsub!("#"+find[0], replace)
         | 
| 116 | 
            +
                  end
         | 
| 117 | 
            +
                  tok.to_sym
         | 
| 118 | 
            +
                end
         | 
| 119 | 
            +
                ################################################################################
         | 
| 110 120 | 
             
                # reads a PDF array from the buffer and converts it to a Ruby Array.
         | 
| 111 121 | 
             
                def array
         | 
| 112 122 | 
             
                  a = []
         | 
| @@ -141,6 +151,7 @@ class PDF::Reader | |
| 141 151 | 
             
                  return "" if str == ")"
         | 
| 142 152 | 
             
                  Error.assert_equal(parse_token, ")")
         | 
| 143 153 |  | 
| 154 | 
            +
                  str.gsub!(/([^\\])(\n\r|\r\n|\r)/m,'\1\n')
         | 
| 144 155 | 
             
                  str.gsub!("\\n","\n")
         | 
| 145 156 | 
             
                  str.gsub!("\\r","\r")
         | 
| 146 157 | 
             
                  str.gsub!("\\t","\t")
         | 
| @@ -150,7 +161,6 @@ class PDF::Reader | |
| 150 161 | 
             
                  str.gsub!("\\)",")")
         | 
| 151 162 | 
             
                  str.gsub!("\\\\","\\")
         | 
| 152 163 | 
             
                  str.gsub!(/\\\n/m,"")
         | 
| 153 | 
            -
                  str.gsub!(/(\n\r|\r\n|\r)/m,"\n")
         | 
| 154 164 |  | 
| 155 165 | 
             
                  str.scan(/\\\d{1,3}/).each do |octal|
         | 
| 156 166 | 
             
                    str.gsub!(octal, octal[1,3].oct.chr)
         | 
    
        metadata
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification 
         | 
| 2 2 | 
             
            name: pdf-reader
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version 
         | 
| 4 | 
            -
              version: 0.8.3 | 
| 4 | 
            +
              version: 0.8.4
         | 
| 5 5 | 
             
            platform: ruby
         | 
| 6 6 | 
             
            authors: 
         | 
| 7 7 | 
             
            - James Healy
         | 
| @@ -9,7 +9,7 @@ autorequire: | |
| 9 9 | 
             
            bindir: bin
         | 
| 10 10 | 
             
            cert_chain: []
         | 
| 11 11 |  | 
| 12 | 
            -
            date: 2010- | 
| 12 | 
            +
            date: 2010-03-30 00:00:00 +05:30
         | 
| 13 13 | 
             
            default_executable: 
         | 
| 14 14 | 
             
            dependencies: 
         | 
| 15 15 | 
             
            - !ruby/object:Gem::Dependency 
         | 
| @@ -43,6 +43,7 @@ files: | |
| 43 43 | 
             
            - examples/hash.rb
         | 
| 44 44 | 
             
            - examples/callbacks.rb
         | 
| 45 45 | 
             
            - examples/text.rb
         | 
| 46 | 
            +
            - examples/version.rb
         | 
| 46 47 | 
             
            - examples/page_counter_improved.rb
         | 
| 47 48 | 
             
            - lib/pdf/reader/glyphlist.txt
         | 
| 48 49 | 
             
            - lib/pdf/reader/content.rb
         | 
| @@ -70,6 +71,7 @@ files: | |
| 70 71 | 
             
            - lib/pdf/reader/parser.rb
         | 
| 71 72 | 
             
            - lib/pdf/hash.rb
         | 
| 72 73 | 
             
            - lib/pdf/reader.rb
         | 
| 74 | 
            +
            - lib/pdf-reader.rb
         | 
| 73 75 | 
             
            - Rakefile
         | 
| 74 76 | 
             
            - README.rdoc
         | 
| 75 77 | 
             
            - TODO
         |