RubyGems - pdfbeads - Versions diffs - 1.0.5 → 1.0.7 - Mend

pdfbeads 1.0.5 → 1.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

data/ChangeLog +14 -1
data/bin/pdfbeads +1 -1
data/lib/pdfbeads/pdfbuilder.rb +178 -57
data/lib/pdfbeads/pdffont.rb +68 -0
metadata +4 -4

data/ChangeLog CHANGED

@@ -25,4 +25,17 @@
       JPEG files were still written with the 'JP2' extension.
     * Some tweaks to minimize the effect of page labels being inconsistently handled
-      in various PDF viewers (prefer ISO-8859-1 strings if possible).
+      in various PDF viewers (prefer ISO-8859-1 strings if possible).
+2012 February 5 (Alexey Kryukov) Version 1.0.6
+    + Add Greek letters (the monotonic set) to the list of characters with hardcoded
+      glyph names and width.
+    * Minor bugs fixed.
+2012 February 10 (Alexey Kryukov) Version 1.0.7
+    + An attempt to achieve better positioning of the hidden text layer, taking into
+      account not just lines, but also individual words. This should work with hOCR
+      files produced with Cuneiform or Tesseract.

data/bin/pdfbeads CHANGED

@@ -144,7 +144,7 @@ OptionParser.new() do |opts|
   opts.on("-b", "--bg-compression FORMAT",
                 ['JP2', 'JPX', 'J2K', 'JPEG2000', 'JPG', 'JPEG', 'LOSSLESS', 'PNG', 'DEFLATE'],
                 "Compression method for background images. Acceptable",
-                "values are JP2|JPX|JPEG2000, JPG|JPEG or LOSSLESS.",
+                "values are JP2|JPX|JPEG2000, JPG|JPEG or PNG|LOSSLESS.",
                 "JP2 is used by default, unless this format is not",
                 "supported by the available version of ImageMagick" ) do |format|
     case format.upcase

data/lib/pdfbeads/pdfbuilder.rb CHANGED

@@ -357,7 +357,7 @@ class PDFBeads::PDFBuilder
       fin.each do |fl|
         next if /^\#/.match( fl )
-        if /^\/?([A-Za-z]+)[         ]*:[         ]+\"(.*)\"/.match( fl )
+        if /^\/?([A-Za-z]+)[ \t]*:[ \t]+\"(.*)\"/.match( fl )
           key = $1
           if keys.include? key
             begin
@@ -390,8 +390,8 @@ class PDFBeads::PDFBuilder
       end
       item_text = item[:title].to_binary
-      item_text.sub!( /\x00\x28/,"\x00\x5C\x28" )
-      item_text.sub!( /\x00\x29/,"\x00\x5C\x29" )
+      item_text.sub!( /\x28/,"\x5C\x28" )
+      item_text.sub!( /\x29/,"\x5C\x29" )
       item[:pdfobj] = XObj.new(Hash[
         'Title'  => "(\xFE\xFF#{item_text.to_text})",
         'Parent' => ref(item[:parent][:pdfobj].getID),
@@ -442,10 +442,95 @@ class PDFBeads::PDFBuilder
     return out
   end
+  def elementText( elem,charset )
+    txt = ''
+    begin
+      txt = elem.to_plain_text.strip
+      txt = Iconv.iconv( 'utf-8',charset,txt ).first unless charset.downcase.eql? 'utf-8'
+    rescue
+    end
+    txt.force_encoding( 'utf-8' ) if txt.respond_to? :force_encoding
+    return txt
+  end
+  def getOCRUnits( ocr_line,lbbox,fsize,charset,xscale,yscale )
+    units = Array.new()
+    ocr_words = ocr_line.search("//span[@class='ocrx_word']")
+    ocr_chars = nil
+    ocr_chars = ocr_line.at("//span[@class='ocr_cinfo']") if ocr_words.length == 0
+    # If 'ocrx_word' elements are available (as in Tesseract output), split the line
+    # into individual words
+    if ocr_words.length > 0
+      ocr_words.each do |word|
+        bbox = elementCoordinates( word,xscale,yscale )
+        txt = elementText( word,charset )
+        units << [txt,bbox]
+      end
+    # If 'ocr_cinfo' data is available (as in Cuneiform output), then split it
+    # into individual characters and then combine them into words
+    elsif not ocr_chars.nil? and ocr_chars.attributes.to_hash.has_key? 'title'
+      if /x_bboxes([-\s\d]+)/.match( ocr_chars.attributes.to_hash['title'] )
+        coords = $1.strip.split(/\s+/)
+        ltxt = elementText( ocr_line,charset )
+        charcnt = 0
+        ltxt.each_char { |uc| charcnt += 1 }
+        if charcnt <= coords.length/4
+          i = 0
+          wtxt = ''
+          bbox = [-1,-1,-1,-1]
+          ltxt.each_char do |uc|
+            cbbox = [ (coords[i*4].to_i*xscale).to_f,(coords[i*4+1].to_i*xscale).to_f,
+                      (coords[i*4+2].to_i*yscale).to_f,(coords[i*4+3].to_i*yscale).to_f ]
+            unless cbbox[0] < 0
+              bbox[0] = cbbox[0] if cbbox[0] < bbox[0] or bbox[0] < 0
+              bbox[1] = cbbox[1] if cbbox[1] < bbox[1] or bbox[1] < 0
+              bbox[2] = cbbox[2] if cbbox[2] > bbox[2] or bbox[2] < 0
+              bbox[3] = cbbox[3] if cbbox[3] > bbox[3] or bbox[3] < 0
+              wtxt << uc
+            else
+              units << [wtxt,bbox]
+              bbox = [-1,-1,-1,-1]
+              if /^\s+$/.match( uc )
+                wtxt = ''
+              # A workaround for a probable hpricot bug, which sometimes causes whitespace
+              # characters from inside a string to be stripped. So if we find
+              # a bounding box with negative values we assume there was a whitespace
+              # character here, even if not preserved in the string itself
+              else
+                wtxt = uc
+                i += 1
+                bbox =  [ (coords[i*4].to_i*xscale).to_f,(coords[i*4+1].to_i*xscale).to_f,
+                          (coords[i*4+2].to_i*yscale).to_f,(coords[i*4+3].to_i*yscale).to_f ]
+              end
+            end
+            i += 1
+          end
+          units << [wtxt,bbox] unless wtxt.eql? ''
+        end
+      end
+    end
+    # If neither word nor character bounding boxes are available, then store the line as a whole
+    if units.length == 0
+      ltxt = elementText( ocr_line,charset )
+      units << [ltxt,lbbox] unless ltxt.eql? ''
+    end
+    units[units.length-1][0].sub!( /-\Z/, "\xC2\xAD" ) unless units.length == 0
+    return units
+  end
   def getPDFText( hocr,pheight,xscale,yscale,encodings )
     fsize = 10
-    cur_enc = encodings[0]
-    ret = " BT 3 Tr /Fnt1 #{fsize} Tf "
+    cur_enc = nil
+    ret = " BT 3 Tr "
     charset = 'utf-8'
     hocr.search("//meta[@http-equiv='Content-Type']").each do |el|
@@ -455,71 +540,107 @@ class PDFBeads::PDFBuilder
     end
     hocr.search("//span[@class='ocr_line']").each do |line|
-      txt = line.to_plain_text.strip.sub( /[\n\r]+/,' ' )
-      begin
-        txt = Iconv.iconv( 'utf-8',charset,txt ).first unless charset.downcase.eql? 'utf-8'
-      rescue
-        txt = ''
+      lbbox = elementCoordinates( line,xscale,yscale )
+      next if lbbox[2] - lbbox[0] <= 0 or lbbox[3] - lbbox[1] <= 0
+      units = getOCRUnits( line,lbbox,fsize,charset,xscale,yscale )
+      next if units.length == 0
+      wwidth = 0
+      ltxt = ''
+      units.each do |unit|
+        ltxt << unit[0]
+        wwidth += ( unit[1][2] - unit[1][0] )
       end
-      next if txt.eql? ''
-      txt.force_encoding( 'utf-8' ) if txt.respond_to? :force_encoding
-      txt.sub!( /-\Z/, "\xC2\xAD" )
+      ratio = wwidth / @fdata.getLineWidth( ltxt,fsize )
+      pos = lbbox[0]
+      posdiff = 0
-      bbox = elementCoordinates( line,xscale,yscale )
-      ratio = ( bbox[2] - bbox[0] ) / @fdata.getLineWidth( txt,fsize )
       ret << sprintf( "%f %f %f %f %f %f Tm ",
-        ratio, 0, 0, ratio, bbox[0], pheight - bbox[3] - @fdata.header['Descent'] * fsize / 1000.0)
-      txt8 = ''
-      txt.each_char do |char|
-        begin
-          Iconv.iconv( "utf-16be","utf-8",char )
-        rescue
-          rawbytes = char.unpack( 'C*' )
-          bs = ''
-          rawbytes.each{ |b| bs << sprintf( "%02x",b ) }
-          $stderr.puts( "Warning: an invalid UTF-8 sequence (#{bs}) in the hOCR data." )
-          char = '?' * rawbytes.length
-        end
+        ratio, 0, 0, ratio, lbbox[0], pheight - lbbox[3] - @fdata.header['Descent'] * fsize / 1000.0 * ratio)
+      in_txt = false
-        encoded = false
-        unless cur_enc.include? char
-          encodings.each_index do |i|
-            enc = encodings[i]
-            next if enc == cur_enc
-            if enc.include? char
-              ret << "<#{txt8}> Tj "
-              cur_enc = enc
-              ret << "/Fnt#{i + 1} #{fsize} Tf "
-              txt8 = ''
-              encoded = true
-              break
-            end
+      units.each_index do |i|
+        unit = units[i]
+        wtxt = unit[0]
+        bbox = unit[1]
+        posdiff = ( (pos - bbox[0]) * 1000 / fsize / ratio ).to_i if i > 0
+        pos = bbox[0] + ( @fdata.getLineWidth( wtxt,fsize ) * ratio )
+        txt8 = ''
+        wtxt.each_char do |char|
+          begin
+            Iconv.iconv( "utf-16be","utf-8",char )
+          rescue
+            rawbytes = char.unpack( 'C*' )
+            bs = ''
+            rawbytes.each{ |b| bs << sprintf( "%02x",b ) }
+            $stderr.puts( "Warning: an invalid UTF-8 sequence (#{bs}) in the hOCR data." )
+            char = '?' * rawbytes.length
           end
-          unless encoded
-            last = encodings[-1]
-            if last.length < 256
-              last << char
-            else
-              last = [ ' ',char ]
-              encodings << last
+          encoded = false
+          if cur_enc.nil? or not cur_enc.include? char
+            encodings.each_index do |i|
+              enc = encodings[i]
+              next if enc == cur_enc
+              if enc.include? char
+                if in_txt
+                  ret << "#{posdiff} " if posdiff != 0
+                  ret << "<#{txt8}> " unless txt8.eql? ''
+                  ret << "] TJ "
+                end
+                cur_enc = enc
+                ret << "/Fnt#{i + 1} #{fsize} Tf "
+                txt8 = ''
+                posdiff = 0
+                encoded = true
+                in_txt = false
+                break
+              end
             end
-            if cur_enc != last
-              ret << "<#{txt8}> Tj "
-              cur_enc = last
-              ret << "/Fnt#{encodings.length} #{fsize} Tf "
-              txt8 = ''
+            unless encoded
+              last = encodings[-1]
+              if last.length < 256
+                last << char
+              else
+                last = [ ' ',char ]
+                encodings << last
+              end
+              if cur_enc != last
+                if in_txt
+                  ret << "#{posdiff} " if posdiff != 0
+                  ret << "<#{txt8}> " unless txt8.eql? ''
+                  ret << "] TJ "
+                end
+                cur_enc = last
+                ret << "/Fnt#{encodings.length} #{fsize} Tf "
+                txt8 = ''
+                posdiff = 0
+                in_txt = false
+              end
             end
           end
+          unless in_txt
+            ret << "[ "
+            in_txt = true
+          end
+          txt8 << sprintf( "%02X",cur_enc.index(char) )
         end
-        txt8 << sprintf( "%02X",cur_enc.index(char) )
+        unless txt8.eql? ''
+          ret << "#{posdiff} " if posdiff != 0
+          ret << "<#{txt8}> "
+        end
+      end
+      if in_txt
+        ret << "] TJ "
+        in_txt = false
       end
-      ret << "<#{txt8}> Tj " unless txt8.eql? ''
     end
     ret << "ET "

data/lib/pdfbeads/pdffont.rb CHANGED

@@ -266,7 +266,75 @@ class PDFBeads::PDFBuilder::FontDataProvider
       0x02DB => ["/ogonek", 333],
       0x02DC => ["/tilde", 333],
       0x02DD => ["/hungarumlaut", 333],
+      0x0386 => ["/Alphatonos", 722],
+      0x0388 => ["/Epsilontonos", 694],
+      0x0389 => ["/Etatonos", 808],
+      0x038A => ["/Iotatonos", 412],
+      0x038C => ["/Omicrontonos", 722],
+      0x038E => ["/Upsilontonos", 816],
+      0x038F => ["/Omegatonos", 744],
+      0x03AC => ["/alphatonos", 522],
+      0x03AD => ["/epsilontonos", 420],
+      0x03AE => ["/etatonos", 522],
+      0x03AF => ["/iotatonos", 268],
+      0x0390 => ["/iotadieresistonos", 268],
+      0x0391 => ["/Alpha", 722],
+      0x0392 => ["/Beta", 667],
+      0x0393 => ["/Gamma", 578],
       0x0394 => ["/Delta", 643],
+      0x0395 => ["/Epsilon", 611],
+      0x0396 => ["/Zeta", 611],
+      0x0397 => ["/Eta", 722],
+      0x0398 => ["/Theta", 722],
+      0x0399 => ["/Iota", 333],
+      0x039A => ["/Kappa", 722],
+      0x039B => ["/Lambda", 724],
+      0x039C => ["/Mu", 889],
+      0x039D => ["/Nu", 722],
+      0x039E => ["/Xi", 643],
+      0x039F => ["/Omicron", 722],
+      0x03A0 => ["/Pi", 722],
+      0x03A1 => ["/Rho", 556],
+      0x03A3 => ["/Sigma", 582],
+      0x03A4 => ["/Tau", 611],
+      0x03A5 => ["/Upsilon", 722],
+      0x03A6 => ["/Phi", 730],
+      0x03A7 => ["/Chi", 722],
+      0x03A8 => ["/Psi", 737],
+      0x03A9 => ["/Omega", 744],
+      0x03AA => ["/Iotadieresis", 333],
+      0x03AB => ["/Upsilondieresis", 722],
+      0x03B0 => ["/upsilondieresistonos", 496],
+      0x03B1 => ["/alpha", 522],
+      0x03B2 => ["/beta", 508],
+      0x03B3 => ["/gamma", 440],
+      0x03B4 => ["/delta", 471],
+      0x03B5 => ["/epsilon", 420],
+      0x03B6 => ["/zeta", 414],
+      0x03B7 => ["/eta", 522],
+      0x03B8 => ["/theta", 480],
+      0x03B9 => ["/iota", 268],
+      0x03BA => ["/kappa", 502],
+      0x03BB => ["/lambda", 484],
+      0x03BC => ["/mu", 500],
+      0x03BD => ["/nu", 452],
+      0x03BE => ["/xi", 444],
+      0x03BF => ["/omicron", 500],
+      0x03C0 => ["/pi", 504],
+      0x03C1 => ["/rho", 500],
+      0x03C2 => ["/sigma1", 396],
+      0x03C3 => ["/sigma", 540],
+      0x03C4 => ["/tau", 400],
+      0x03C5 => ["/upsilon", 496],
+      0x03C6 => ["/phi", 578],
+      0x03C7 => ["/chi", 444],
+      0x03C8 => ["/psi", 624],
+      0x03C9 => ["/omega", 658],
+      0x03CA => ["/iotadieresis", 268],
+      0x03CB => ["/upsilondieresis", 496],
+      0x03CC => ["/omicrontonos", 500],
+      0x03CD => ["/upsilontonos", 496],
+      0x03CE => ["/omegatonos", 658],
       0x0401 => ["/afii10023", 611],
       0x0402 => ["/afii10051", 752],
       0x0403 => ["/afii10052", 578],

metadata CHANGED

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: pdfbeads
 version: !ruby/object:Gem::Version
-  hash: 29
+  hash: 25
   prerelease:
   segments:
   - 1
   - 0
-  - 5
-  version: 1.0.5
+  - 7
+  version: 1.0.7
 platform: ruby
 authors:
 - Alexey Kryukov
@@ -15,7 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-02-04 00:00:00 +04:00
+date: 2012-02-10 00:00:00 +04:00
 default_executable: pdfbeads
 dependencies: []