pdfbeads 1.0.5 → 1.0.7

Sign up to get free protection for your applications and to get access to all the features.
data/ChangeLog CHANGED
@@ -25,4 +25,17 @@
25
25
  JPEG files were still written with the 'JP2' extension.
26
26
 
27
27
  * Some tweaks to minimize the effect of page labels being inconsistently handled
28
- in various PDF viewers (prefer ISO-8859-1 strings if possible).
28
+ in various PDF viewers (prefer ISO-8859-1 strings if possible).
29
+
30
+ 2012 February 5 (Alexey Kryukov) Version 1.0.6
31
+
32
+ + Add Greek letters (the monotonic set) to the list of characters with hardcoded
33
+ glyph names and width.
34
+
35
+ * Minor bugs fixed.
36
+
37
+ 2012 February 10 (Alexey Kryukov) Version 1.0.7
38
+
39
+ + An attempt to achieve better positioning of the hidden text layer, taking into
40
+ account not just lines, but also individual words. This should work with hOCR
41
+ files produced with Cuneiform or Tesseract.
@@ -144,7 +144,7 @@ OptionParser.new() do |opts|
144
144
  opts.on("-b", "--bg-compression FORMAT",
145
145
  ['JP2', 'JPX', 'J2K', 'JPEG2000', 'JPG', 'JPEG', 'LOSSLESS', 'PNG', 'DEFLATE'],
146
146
  "Compression method for background images. Acceptable",
147
- "values are JP2|JPX|JPEG2000, JPG|JPEG or LOSSLESS.",
147
+ "values are JP2|JPX|JPEG2000, JPG|JPEG or PNG|LOSSLESS.",
148
148
  "JP2 is used by default, unless this format is not",
149
149
  "supported by the available version of ImageMagick" ) do |format|
150
150
  case format.upcase
@@ -357,7 +357,7 @@ class PDFBeads::PDFBuilder
357
357
  fin.each do |fl|
358
358
  next if /^\#/.match( fl )
359
359
 
360
- if /^\/?([A-Za-z]+)[ ]*:[ ]+\"(.*)\"/.match( fl )
360
+ if /^\/?([A-Za-z]+)[ \t]*:[ \t]+\"(.*)\"/.match( fl )
361
361
  key = $1
362
362
  if keys.include? key
363
363
  begin
@@ -390,8 +390,8 @@ class PDFBeads::PDFBuilder
390
390
  end
391
391
 
392
392
  item_text = item[:title].to_binary
393
- item_text.sub!( /\x00\x28/,"\x00\x5C\x28" )
394
- item_text.sub!( /\x00\x29/,"\x00\x5C\x29" )
393
+ item_text.sub!( /\x28/,"\x5C\x28" )
394
+ item_text.sub!( /\x29/,"\x5C\x29" )
395
395
  item[:pdfobj] = XObj.new(Hash[
396
396
  'Title' => "(\xFE\xFF#{item_text.to_text})",
397
397
  'Parent' => ref(item[:parent][:pdfobj].getID),
@@ -442,10 +442,95 @@ class PDFBeads::PDFBuilder
442
442
  return out
443
443
  end
444
444
 
445
+ def elementText( elem,charset )
446
+ txt = ''
447
+ begin
448
+ txt = elem.to_plain_text.strip
449
+ txt = Iconv.iconv( 'utf-8',charset,txt ).first unless charset.downcase.eql? 'utf-8'
450
+ rescue
451
+ end
452
+
453
+ txt.force_encoding( 'utf-8' ) if txt.respond_to? :force_encoding
454
+ return txt
455
+ end
456
+
457
+ def getOCRUnits( ocr_line,lbbox,fsize,charset,xscale,yscale )
458
+ units = Array.new()
459
+ ocr_words = ocr_line.search("//span[@class='ocrx_word']")
460
+ ocr_chars = nil
461
+ ocr_chars = ocr_line.at("//span[@class='ocr_cinfo']") if ocr_words.length == 0
462
+
463
+ # If 'ocrx_word' elements are available (as in Tesseract output), split the line
464
+ # into individual words
465
+ if ocr_words.length > 0
466
+ ocr_words.each do |word|
467
+ bbox = elementCoordinates( word,xscale,yscale )
468
+ txt = elementText( word,charset )
469
+ units << [txt,bbox]
470
+ end
471
+
472
+ # If 'ocr_cinfo' data is available (as in Cuneiform output), then split it
473
+ # into individual characters and then combine them into words
474
+ elsif not ocr_chars.nil? and ocr_chars.attributes.to_hash.has_key? 'title'
475
+ if /x_bboxes([-\s\d]+)/.match( ocr_chars.attributes.to_hash['title'] )
476
+ coords = $1.strip.split(/\s+/)
477
+ ltxt = elementText( ocr_line,charset )
478
+ charcnt = 0
479
+ ltxt.each_char { |uc| charcnt += 1 }
480
+
481
+ if charcnt <= coords.length/4
482
+ i = 0
483
+ wtxt = ''
484
+ bbox = [-1,-1,-1,-1]
485
+ ltxt.each_char do |uc|
486
+ cbbox = [ (coords[i*4].to_i*xscale).to_f,(coords[i*4+1].to_i*xscale).to_f,
487
+ (coords[i*4+2].to_i*yscale).to_f,(coords[i*4+3].to_i*yscale).to_f ]
488
+
489
+ unless cbbox[0] < 0
490
+ bbox[0] = cbbox[0] if cbbox[0] < bbox[0] or bbox[0] < 0
491
+ bbox[1] = cbbox[1] if cbbox[1] < bbox[1] or bbox[1] < 0
492
+ bbox[2] = cbbox[2] if cbbox[2] > bbox[2] or bbox[2] < 0
493
+ bbox[3] = cbbox[3] if cbbox[3] > bbox[3] or bbox[3] < 0
494
+ wtxt << uc
495
+
496
+ else
497
+ units << [wtxt,bbox]
498
+ bbox = [-1,-1,-1,-1]
499
+ if /^\s+$/.match( uc )
500
+ wtxt = ''
501
+
502
+ # A workaround for probable hpricot bug, which sometimes causes whitespace
503
+ # characters from inside a string to be stripped. So if we find
504
+ # a bounding box with negative values we assume there was a whitespace
505
+ # character here, even if not preserved in the string itself
506
+ else
507
+ wtxt = uc
508
+ i += 1
509
+ bbox = [ (coords[i*4].to_i*xscale).to_f,(coords[i*4+1].to_i*xscale).to_f,
510
+ (coords[i*4+2].to_i*yscale).to_f,(coords[i*4+3].to_i*yscale).to_f ]
511
+ end
512
+ end
513
+ i += 1
514
+ end
515
+ units << [wtxt,bbox] unless wtxt.eql? ''
516
+ end
517
+ end
518
+ end
519
+
520
+ # If neither word nor character bounding boxes are available, then store the line as a whole
521
+ if units.length == 0
522
+ ltxt = elementText( ocr_line,charset )
523
+ units << [ltxt,lbbox] unless ltxt.eql? ''
524
+ end
525
+
526
+ units[units.length-1][0].sub!( /-\Z/, "\xC2\xAD" ) unless units.length == 0
527
+ return units
528
+ end
529
+
445
530
  def getPDFText( hocr,pheight,xscale,yscale,encodings )
446
531
  fsize = 10
447
- cur_enc = encodings[0]
448
- ret = " BT 3 Tr /Fnt1 #{fsize} Tf "
532
+ cur_enc = nil
533
+ ret = " BT 3 Tr "
449
534
 
450
535
  charset = 'utf-8'
451
536
  hocr.search("//meta[@http-equiv='Content-Type']").each do |el|
@@ -455,71 +540,107 @@ class PDFBeads::PDFBuilder
455
540
  end
456
541
 
457
542
  hocr.search("//span[@class='ocr_line']").each do |line|
458
- txt = line.to_plain_text.strip.sub( /[\n\r]+/,' ' )
459
- begin
460
- txt = Iconv.iconv( 'utf-8',charset,txt ).first unless charset.downcase.eql? 'utf-8'
461
- rescue
462
- txt = ''
543
+ lbbox = elementCoordinates( line,xscale,yscale )
544
+ next if lbbox[2] - lbbox[0] <= 0 or lbbox[3] - lbbox[1] <= 0
545
+ units = getOCRUnits( line,lbbox,fsize,charset,xscale,yscale )
546
+ next if units.length == 0
547
+
548
+ wwidth = 0
549
+ ltxt = ''
550
+ units.each do |unit|
551
+ ltxt << unit[0]
552
+ wwidth += ( unit[1][2] - unit[1][0] )
463
553
  end
464
- next if txt.eql? ''
465
- txt.force_encoding( 'utf-8' ) if txt.respond_to? :force_encoding
466
- txt.sub!( /-\Z/, "\xC2\xAD" )
554
+ ratio = wwidth / @fdata.getLineWidth( ltxt,fsize )
555
+ pos = lbbox[0]
556
+ posdiff = 0
467
557
 
468
- bbox = elementCoordinates( line,xscale,yscale )
469
- ratio = ( bbox[2] - bbox[0] ) / @fdata.getLineWidth( txt,fsize )
470
558
  ret << sprintf( "%f %f %f %f %f %f Tm ",
471
- ratio, 0, 0, ratio, bbox[0], pheight - bbox[3] - @fdata.header['Descent'] * fsize / 1000.0)
472
-
473
- txt8 = ''
474
- txt.each_char do |char|
475
- begin
476
- Iconv.iconv( "utf-16be","utf-8",char )
477
- rescue
478
- rawbytes = char.unpack( 'C*' )
479
- bs = ''
480
- rawbytes.each{ |b| bs << sprintf( "%02x",b ) }
481
- $stderr.puts( "Warning: an invalid UTF-8 sequence (#{bs}) in the hOCR data." )
482
- char = '?' * rawbytes.length
483
- end
559
+ ratio, 0, 0, ratio, lbbox[0], pheight - lbbox[3] - @fdata.header['Descent'] * fsize / 1000.0 * ratio)
560
+ in_txt = false
484
561
 
485
- encoded = false
486
- unless cur_enc.include? char
487
- encodings.each_index do |i|
488
- enc = encodings[i]
489
- next if enc == cur_enc
490
-
491
- if enc.include? char
492
- ret << "<#{txt8}> Tj "
493
- cur_enc = enc
494
- ret << "/Fnt#{i + 1} #{fsize} Tf "
495
- txt8 = ''
496
- encoded = true
497
- break
498
- end
562
+ units.each_index do |i|
563
+ unit = units[i]
564
+ wtxt = unit[0]
565
+ bbox = unit[1]
566
+
567
+ posdiff = ( (pos - bbox[0]) * 1000 / fsize / ratio ).to_i if i > 0
568
+ pos = bbox[0] + ( @fdata.getLineWidth( wtxt,fsize ) * ratio )
569
+
570
+ txt8 = ''
571
+ wtxt.each_char do |char|
572
+ begin
573
+ Iconv.iconv( "utf-16be","utf-8",char )
574
+ rescue
575
+ rawbytes = char.unpack( 'C*' )
576
+ bs = ''
577
+ rawbytes.each{ |b| bs << sprintf( "%02x",b ) }
578
+ $stderr.puts( "Warning: an invalid UTF-8 sequence (#{bs}) in the hOCR data." )
579
+ char = '?' * rawbytes.length
499
580
  end
500
581
 
501
- unless encoded
502
- last = encodings[-1]
503
- if last.length < 256
504
- last << char
505
- else
506
- last = [ ' ',char ]
507
- encodings << last
582
+ encoded = false
583
+ if cur_enc.nil? or not cur_enc.include? char
584
+ encodings.each_index do |i|
585
+ enc = encodings[i]
586
+ next if enc == cur_enc
587
+
588
+ if enc.include? char
589
+ if in_txt
590
+ ret << "#{posdiff} " if posdiff != 0
591
+ ret << "<#{txt8}> " unless txt8.eql? ''
592
+ ret << "] TJ "
593
+ end
594
+ cur_enc = enc
595
+ ret << "/Fnt#{i + 1} #{fsize} Tf "
596
+ txt8 = ''
597
+ posdiff = 0
598
+ encoded = true
599
+ in_txt = false
600
+ break
601
+ end
508
602
  end
509
603
 
510
- if cur_enc != last
511
- ret << "<#{txt8}> Tj "
512
- cur_enc = last
513
- ret << "/Fnt#{encodings.length} #{fsize} Tf "
514
- txt8 = ''
604
+ unless encoded
605
+ last = encodings[-1]
606
+ if last.length < 256
607
+ last << char
608
+ else
609
+ last = [ ' ',char ]
610
+ encodings << last
611
+ end
612
+
613
+ if cur_enc != last
614
+ if in_txt
615
+ ret << "#{posdiff} " if posdiff != 0
616
+ ret << "<#{txt8}> " unless txt8.eql? ''
617
+ ret << "] TJ "
618
+ end
619
+ cur_enc = last
620
+ ret << "/Fnt#{encodings.length} #{fsize} Tf "
621
+ txt8 = ''
622
+ posdiff = 0
623
+ in_txt = false
624
+ end
515
625
  end
516
626
  end
627
+
628
+ unless in_txt
629
+ ret << "[ "
630
+ in_txt = true
631
+ end
632
+ txt8 << sprintf( "%02X",cur_enc.index(char) )
517
633
  end
518
634
 
519
- txt8 << sprintf( "%02X",cur_enc.index(char) )
635
+ unless txt8.eql? ''
636
+ ret << "#{posdiff} " if posdiff != 0
637
+ ret << "<#{txt8}> "
638
+ end
639
+ end
640
+ if in_txt
641
+ ret << "] TJ "
642
+ in_txt = false
520
643
  end
521
-
522
- ret << "<#{txt8}> Tj " unless txt8.eql? ''
523
644
  end
524
645
 
525
646
  ret << "ET "
@@ -266,7 +266,75 @@ class PDFBeads::PDFBuilder::FontDataProvider
266
266
  0x02DB => ["/ogonek", 333],
267
267
  0x02DC => ["/tilde", 333],
268
268
  0x02DD => ["/hungarumlaut", 333],
269
+ 0x0338 => ["/Alphatonos", 722],
270
+ 0x0388 => ["/Epsilontonos", 694],
271
+ 0x0389 => ["/Etatonos", 808],
272
+ 0x038A => ["/Iotatonos", 412],
273
+ 0x038C => ["/Omicrontonos", 722],
274
+ 0x038E => ["/Upsilontonos", 816],
275
+ 0x038F => ["/Omegatonos", 744],
276
+ 0x03AC => ["/alphatonos", 522],
277
+ 0x03AD => ["/epsilontonos", 420],
278
+ 0x03AE => ["/etatonos", 522],
279
+ 0x03AF => ["/iotatonos", 268],
280
+ 0x0390 => ["/iotadieresistonos", 268],
281
+ 0x0391 => ["/Alpha", 722],
282
+ 0x0392 => ["/Beta", 667],
283
+ 0x0393 => ["/Gamma", 578],
269
284
  0x0394 => ["/Delta", 643],
285
+ 0x0395 => ["/Epsilon", 611],
286
+ 0x0396 => ["/Zeta", 611],
287
+ 0x0397 => ["/Eta", 722],
288
+ 0x0398 => ["/Theta", 722],
289
+ 0x0399 => ["/Iota", 333],
290
+ 0x039A => ["/Kappa", 722],
291
+ 0x039B => ["/Lambda", 724],
292
+ 0x039C => ["/Mu", 889],
293
+ 0x039D => ["/Nu", 722],
294
+ 0x039E => ["/Xi", 643],
295
+ 0x039F => ["/Omicron", 722],
296
+ 0x03A0 => ["/Pi", 722],
297
+ 0x03A1 => ["/Rho", 556],
298
+ 0x03A3 => ["/Sigma", 582],
299
+ 0x03A4 => ["/Tau", 611],
300
+ 0x03A5 => ["/Upsilon", 722],
301
+ 0x03A6 => ["/Phi", 730],
302
+ 0x03A7 => ["/Chi", 722],
303
+ 0x03A8 => ["/Psi", 737],
304
+ 0x03A9 => ["/Omega", 744],
305
+ 0x03AA => ["/Iotadieresis", 333],
306
+ 0x03AB => ["/Upsilondieresis", 722],
307
+ 0x03B0 => ["/upsilondieresistonos", 496],
308
+ 0x03B1 => ["/alpha", 522],
309
+ 0x03B2 => ["/beta", 508],
310
+ 0x03B3 => ["/gamma", 440],
311
+ 0x03B4 => ["/delta", 471],
312
+ 0x03B5 => ["/epsilon", 420],
313
+ 0x03B6 => ["/zeta", 414],
314
+ 0x03B7 => ["/eta", 522],
315
+ 0x03B8 => ["/theta", 480],
316
+ 0x03B9 => ["/iota", 268],
317
+ 0x03BA => ["/kappa", 502],
318
+ 0x03BB => ["/lambda", 484],
319
+ 0x03BC => ["/mu", 500],
320
+ 0x03BD => ["/nu", 452],
321
+ 0x03BE => ["/xi", 444],
322
+ 0x03BF => ["/omicron", 500],
323
+ 0x03C0 => ["/pi", 504],
324
+ 0x03C1 => ["/rho", 500],
325
+ 0x03C2 => ["/sigma1", 396],
326
+ 0x03C3 => ["/sigma", 540],
327
+ 0x03C4 => ["/tau", 400],
328
+ 0x03C5 => ["/upsilon", 496],
329
+ 0x03C6 => ["/phi", 578],
330
+ 0x03C7 => ["/chi", 444],
331
+ 0x03C8 => ["/psi", 624],
332
+ 0x03C9 => ["/omega", 658],
333
+ 0x03CA => ["/iotadieresis", 268],
334
+ 0x03CB => ["/upsilondieresis", 496],
335
+ 0x03CC => ["/omicrontonos", 500],
336
+ 0x03CD => ["/upsilontonos", 496],
337
+ 0x03CE => ["/omegatonos", 658],
270
338
  0x0401 => ["/afii10023", 611],
271
339
  0x0402 => ["/afii10051", 752],
272
340
  0x0403 => ["/afii10052", 578],
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdfbeads
3
3
  version: !ruby/object:Gem::Version
4
- hash: 29
4
+ hash: 25
5
5
  prerelease:
6
6
  segments:
7
7
  - 1
8
8
  - 0
9
- - 5
10
- version: 1.0.5
9
+ - 7
10
+ version: 1.0.7
11
11
  platform: ruby
12
12
  authors:
13
13
  - Alexey Kryukov
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2012-02-04 00:00:00 +04:00
18
+ date: 2012-02-10 00:00:00 +04:00
19
19
  default_executable: pdfbeads
20
20
  dependencies: []
21
21