pdfbeads 1.0.5 → 1.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ChangeLog CHANGED
@@ -25,4 +25,17 @@
25
25
  JPEG files were still written with the 'JP2' extension.
26
26
 
27
27
  * Some tweaks to minimize the effect of page labels being inconsistently handled
28
- in various PDF viewers (prefer ISO-8859-1 strings if possible).
28
+ in various PDF viewers (prefer ISO-8859-1 strings if possible).
29
+
30
+ 2012 February 5 (Alexey Kryukov) Version 1.0.6
31
+
32
+ + Add Greek letters (the monotonic set) to the list of characters with hardcoded
33
+ glyph names and width.
34
+
35
+ * Minor bugs fixed.
36
+
37
+ 2012 February 10 (Alexey Kryukov) Version 1.0.7
38
+
39
+ + An attempt to achieve better positioning of the hidden text layer, taking into
40
+ account not just lines, but also individual words. This should work with hOCR
41
+ files produced with Cuneiform or Tesseract.
@@ -144,7 +144,7 @@ OptionParser.new() do |opts|
144
144
  opts.on("-b", "--bg-compression FORMAT",
145
145
  ['JP2', 'JPX', 'J2K', 'JPEG2000', 'JPG', 'JPEG', 'LOSSLESS', 'PNG', 'DEFLATE'],
146
146
  "Compression method for background images. Acceptable",
147
- "values are JP2|JPX|JPEG2000, JPG|JPEG or LOSSLESS.",
147
+ "values are JP2|JPX|JPEG2000, JPG|JPEG or PNG|LOSSLESS.",
148
148
  "JP2 is used by default, unless this format is not",
149
149
  "supported by the available version of ImageMagick" ) do |format|
150
150
  case format.upcase
@@ -357,7 +357,7 @@ class PDFBeads::PDFBuilder
357
357
  fin.each do |fl|
358
358
  next if /^\#/.match( fl )
359
359
 
360
- if /^\/?([A-Za-z]+)[ ]*:[ ]+\"(.*)\"/.match( fl )
360
+ if /^\/?([A-Za-z]+)[ \t]*:[ \t]+\"(.*)\"/.match( fl )
361
361
  key = $1
362
362
  if keys.include? key
363
363
  begin
@@ -390,8 +390,8 @@ class PDFBeads::PDFBuilder
390
390
  end
391
391
 
392
392
  item_text = item[:title].to_binary
393
- item_text.sub!( /\x00\x28/,"\x00\x5C\x28" )
394
- item_text.sub!( /\x00\x29/,"\x00\x5C\x29" )
393
+ item_text.sub!( /\x28/,"\x5C\x28" )
394
+ item_text.sub!( /\x29/,"\x5C\x29" )
395
395
  item[:pdfobj] = XObj.new(Hash[
396
396
  'Title' => "(\xFE\xFF#{item_text.to_text})",
397
397
  'Parent' => ref(item[:parent][:pdfobj].getID),
@@ -442,10 +442,95 @@ class PDFBeads::PDFBuilder
442
442
  return out
443
443
  end
444
444
 
445
+ def elementText( elem,charset )
446
+ txt = ''
447
+ begin
448
+ txt = elem.to_plain_text.strip
449
+ txt = Iconv.iconv( 'utf-8',charset,txt ).first unless charset.downcase.eql? 'utf-8'
450
+ rescue
451
+ end
452
+
453
+ txt.force_encoding( 'utf-8' ) if txt.respond_to? :force_encoding
454
+ return txt
455
+ end
456
+
457
+ def getOCRUnits( ocr_line,lbbox,fsize,charset,xscale,yscale )
458
+ units = Array.new()
459
+ ocr_words = ocr_line.search("//span[@class='ocrx_word']")
460
+ ocr_chars = nil
461
+ ocr_chars = ocr_line.at("//span[@class='ocr_cinfo']") if ocr_words.length == 0
462
+
463
+ # If 'ocrx_word' elements are available (as in Tesseract output), split the line
464
+ # into individual words
465
+ if ocr_words.length > 0
466
+ ocr_words.each do |word|
467
+ bbox = elementCoordinates( word,xscale,yscale )
468
+ txt = elementText( word,charset )
469
+ units << [txt,bbox]
470
+ end
471
+
472
+ # If 'ocr_cinfo' data is available (as in Cuneiform output), then split it
473
+ # into individual characters and then combine them into words
474
+ elsif not ocr_chars.nil? and ocr_chars.attributes.to_hash.has_key? 'title'
475
+ if /x_bboxes([-\s\d]+)/.match( ocr_chars.attributes.to_hash['title'] )
476
+ coords = $1.strip.split(/\s+/)
477
+ ltxt = elementText( ocr_line,charset )
478
+ charcnt = 0
479
+ ltxt.each_char { |uc| charcnt += 1 }
480
+
481
+ if charcnt <= coords.length/4
482
+ i = 0
483
+ wtxt = ''
484
+ bbox = [-1,-1,-1,-1]
485
+ ltxt.each_char do |uc|
486
+ cbbox = [ (coords[i*4].to_i*xscale).to_f,(coords[i*4+1].to_i*xscale).to_f,
487
+ (coords[i*4+2].to_i*yscale).to_f,(coords[i*4+3].to_i*yscale).to_f ]
488
+
489
+ unless cbbox[0] < 0
490
+ bbox[0] = cbbox[0] if cbbox[0] < bbox[0] or bbox[0] < 0
491
+ bbox[1] = cbbox[1] if cbbox[1] < bbox[1] or bbox[1] < 0
492
+ bbox[2] = cbbox[2] if cbbox[2] > bbox[2] or bbox[2] < 0
493
+ bbox[3] = cbbox[3] if cbbox[3] > bbox[3] or bbox[3] < 0
494
+ wtxt << uc
495
+
496
+ else
497
+ units << [wtxt,bbox]
498
+ bbox = [-1,-1,-1,-1]
499
+ if /^\s+$/.match( uc )
500
+ wtxt = ''
501
+
502
+ # A workaround for a probable hpricot bug, which sometimes causes whitespace
503
+ # characters from inside a string to be stripped. So if we find
504
+ # a bounding box with negative values we assume there was a whitespace
505
+ # character here, even if not preserved in the string itself
506
+ else
507
+ wtxt = uc
508
+ i += 1
509
+ bbox = [ (coords[i*4].to_i*xscale).to_f,(coords[i*4+1].to_i*xscale).to_f,
510
+ (coords[i*4+2].to_i*yscale).to_f,(coords[i*4+3].to_i*yscale).to_f ]
511
+ end
512
+ end
513
+ i += 1
514
+ end
515
+ units << [wtxt,bbox] unless wtxt.eql? ''
516
+ end
517
+ end
518
+ end
519
+
520
+ # If neither word nor character bounding boxes are available, then store the line as a whole
521
+ if units.length == 0
522
+ ltxt = elementText( ocr_line,charset )
523
+ units << [ltxt,lbbox] unless ltxt.eql? ''
524
+ end
525
+
526
+ units[units.length-1][0].sub!( /-\Z/, "\xC2\xAD" ) unless units.length == 0
527
+ return units
528
+ end
529
+
445
530
  def getPDFText( hocr,pheight,xscale,yscale,encodings )
446
531
  fsize = 10
447
- cur_enc = encodings[0]
448
- ret = " BT 3 Tr /Fnt1 #{fsize} Tf "
532
+ cur_enc = nil
533
+ ret = " BT 3 Tr "
449
534
 
450
535
  charset = 'utf-8'
451
536
  hocr.search("//meta[@http-equiv='Content-Type']").each do |el|
@@ -455,71 +540,107 @@ class PDFBeads::PDFBuilder
455
540
  end
456
541
 
457
542
  hocr.search("//span[@class='ocr_line']").each do |line|
458
- txt = line.to_plain_text.strip.sub( /[\n\r]+/,' ' )
459
- begin
460
- txt = Iconv.iconv( 'utf-8',charset,txt ).first unless charset.downcase.eql? 'utf-8'
461
- rescue
462
- txt = ''
543
+ lbbox = elementCoordinates( line,xscale,yscale )
544
+ next if lbbox[2] - lbbox[0] <= 0 or lbbox[3] - lbbox[1] <= 0
545
+ units = getOCRUnits( line,lbbox,fsize,charset,xscale,yscale )
546
+ next if units.length == 0
547
+
548
+ wwidth = 0
549
+ ltxt = ''
550
+ units.each do |unit|
551
+ ltxt << unit[0]
552
+ wwidth += ( unit[1][2] - unit[1][0] )
463
553
  end
464
- next if txt.eql? ''
465
- txt.force_encoding( 'utf-8' ) if txt.respond_to? :force_encoding
466
- txt.sub!( /-\Z/, "\xC2\xAD" )
554
+ ratio = wwidth / @fdata.getLineWidth( ltxt,fsize )
555
+ pos = lbbox[0]
556
+ posdiff = 0
467
557
 
468
- bbox = elementCoordinates( line,xscale,yscale )
469
- ratio = ( bbox[2] - bbox[0] ) / @fdata.getLineWidth( txt,fsize )
470
558
  ret << sprintf( "%f %f %f %f %f %f Tm ",
471
- ratio, 0, 0, ratio, bbox[0], pheight - bbox[3] - @fdata.header['Descent'] * fsize / 1000.0)
472
-
473
- txt8 = ''
474
- txt.each_char do |char|
475
- begin
476
- Iconv.iconv( "utf-16be","utf-8",char )
477
- rescue
478
- rawbytes = char.unpack( 'C*' )
479
- bs = ''
480
- rawbytes.each{ |b| bs << sprintf( "%02x",b ) }
481
- $stderr.puts( "Warning: an invalid UTF-8 sequence (#{bs}) in the hOCR data." )
482
- char = '?' * rawbytes.length
483
- end
559
+ ratio, 0, 0, ratio, lbbox[0], pheight - lbbox[3] - @fdata.header['Descent'] * fsize / 1000.0 * ratio)
560
+ in_txt = false
484
561
 
485
- encoded = false
486
- unless cur_enc.include? char
487
- encodings.each_index do |i|
488
- enc = encodings[i]
489
- next if enc == cur_enc
490
-
491
- if enc.include? char
492
- ret << "<#{txt8}> Tj "
493
- cur_enc = enc
494
- ret << "/Fnt#{i + 1} #{fsize} Tf "
495
- txt8 = ''
496
- encoded = true
497
- break
498
- end
562
+ units.each_index do |i|
563
+ unit = units[i]
564
+ wtxt = unit[0]
565
+ bbox = unit[1]
566
+
567
+ posdiff = ( (pos - bbox[0]) * 1000 / fsize / ratio ).to_i if i > 0
568
+ pos = bbox[0] + ( @fdata.getLineWidth( wtxt,fsize ) * ratio )
569
+
570
+ txt8 = ''
571
+ wtxt.each_char do |char|
572
+ begin
573
+ Iconv.iconv( "utf-16be","utf-8",char )
574
+ rescue
575
+ rawbytes = char.unpack( 'C*' )
576
+ bs = ''
577
+ rawbytes.each{ |b| bs << sprintf( "%02x",b ) }
578
+ $stderr.puts( "Warning: an invalid UTF-8 sequence (#{bs}) in the hOCR data." )
579
+ char = '?' * rawbytes.length
499
580
  end
500
581
 
501
- unless encoded
502
- last = encodings[-1]
503
- if last.length < 256
504
- last << char
505
- else
506
- last = [ ' ',char ]
507
- encodings << last
582
+ encoded = false
583
+ if cur_enc.nil? or not cur_enc.include? char
584
+ encodings.each_index do |i|
585
+ enc = encodings[i]
586
+ next if enc == cur_enc
587
+
588
+ if enc.include? char
589
+ if in_txt
590
+ ret << "#{posdiff} " if posdiff != 0
591
+ ret << "<#{txt8}> " unless txt8.eql? ''
592
+ ret << "] TJ "
593
+ end
594
+ cur_enc = enc
595
+ ret << "/Fnt#{i + 1} #{fsize} Tf "
596
+ txt8 = ''
597
+ posdiff = 0
598
+ encoded = true
599
+ in_txt = false
600
+ break
601
+ end
508
602
  end
509
603
 
510
- if cur_enc != last
511
- ret << "<#{txt8}> Tj "
512
- cur_enc = last
513
- ret << "/Fnt#{encodings.length} #{fsize} Tf "
514
- txt8 = ''
604
+ unless encoded
605
+ last = encodings[-1]
606
+ if last.length < 256
607
+ last << char
608
+ else
609
+ last = [ ' ',char ]
610
+ encodings << last
611
+ end
612
+
613
+ if cur_enc != last
614
+ if in_txt
615
+ ret << "#{posdiff} " if posdiff != 0
616
+ ret << "<#{txt8}> " unless txt8.eql? ''
617
+ ret << "] TJ "
618
+ end
619
+ cur_enc = last
620
+ ret << "/Fnt#{encodings.length} #{fsize} Tf "
621
+ txt8 = ''
622
+ posdiff = 0
623
+ in_txt = false
624
+ end
515
625
  end
516
626
  end
627
+
628
+ unless in_txt
629
+ ret << "[ "
630
+ in_txt = true
631
+ end
632
+ txt8 << sprintf( "%02X",cur_enc.index(char) )
517
633
  end
518
634
 
519
- txt8 << sprintf( "%02X",cur_enc.index(char) )
635
+ unless txt8.eql? ''
636
+ ret << "#{posdiff} " if posdiff != 0
637
+ ret << "<#{txt8}> "
638
+ end
639
+ end
640
+ if in_txt
641
+ ret << "] TJ "
642
+ in_txt = false
520
643
  end
521
-
522
- ret << "<#{txt8}> Tj " unless txt8.eql? ''
523
644
  end
524
645
 
525
646
  ret << "ET "
@@ -266,7 +266,75 @@ class PDFBeads::PDFBuilder::FontDataProvider
266
266
  0x02DB => ["/ogonek", 333],
267
267
  0x02DC => ["/tilde", 333],
268
268
  0x02DD => ["/hungarumlaut", 333],
269
+ 0x0338 => ["/Alphatonos", 722],
270
+ 0x0388 => ["/Epsilontonos", 694],
271
+ 0x0389 => ["/Etatonos", 808],
272
+ 0x038A => ["/Iotatonos", 412],
273
+ 0x038C => ["/Omicrontonos", 722],
274
+ 0x038E => ["/Upsilontonos", 816],
275
+ 0x038F => ["/Omegatonos", 744],
276
+ 0x03AC => ["/alphatonos", 522],
277
+ 0x03AD => ["/epsilontonos", 420],
278
+ 0x03AE => ["/etatonos", 522],
279
+ 0x03AF => ["/iotatonos", 268],
280
+ 0x0390 => ["/iotadieresistonos", 268],
281
+ 0x0391 => ["/Alpha", 722],
282
+ 0x0392 => ["/Beta", 667],
283
+ 0x0393 => ["/Gamma", 578],
269
284
  0x0394 => ["/Delta", 643],
285
+ 0x0395 => ["/Epsilon", 611],
286
+ 0x0396 => ["/Zeta", 611],
287
+ 0x0397 => ["/Eta", 722],
288
+ 0x0398 => ["/Theta", 722],
289
+ 0x0399 => ["/Iota", 333],
290
+ 0x039A => ["/Kappa", 722],
291
+ 0x039B => ["/Lambda", 724],
292
+ 0x039C => ["/Mu", 889],
293
+ 0x039D => ["/Nu", 722],
294
+ 0x039E => ["/Xi", 643],
295
+ 0x039F => ["/Omicron", 722],
296
+ 0x03A0 => ["/Pi", 722],
297
+ 0x03A1 => ["/Rho", 556],
298
+ 0x03A3 => ["/Sigma", 582],
299
+ 0x03A4 => ["/Tau", 611],
300
+ 0x03A5 => ["/Upsilon", 722],
301
+ 0x03A6 => ["/Phi", 730],
302
+ 0x03A7 => ["/Chi", 722],
303
+ 0x03A8 => ["/Psi", 737],
304
+ 0x03A9 => ["/Omega", 744],
305
+ 0x03AA => ["/Iotadieresis", 333],
306
+ 0x03AB => ["/Upsilondieresis", 722],
307
+ 0x03B0 => ["/upsilondieresistonos", 496],
308
+ 0x03B1 => ["/alpha", 522],
309
+ 0x03B2 => ["/beta", 508],
310
+ 0x03B3 => ["/gamma", 440],
311
+ 0x03B4 => ["/delta", 471],
312
+ 0x03B5 => ["/epsilon", 420],
313
+ 0x03B6 => ["/zeta", 414],
314
+ 0x03B7 => ["/eta", 522],
315
+ 0x03B8 => ["/theta", 480],
316
+ 0x03B9 => ["/iota", 268],
317
+ 0x03BA => ["/kappa", 502],
318
+ 0x03BB => ["/lambda", 484],
319
+ 0x03BC => ["/mu", 500],
320
+ 0x03BD => ["/nu", 452],
321
+ 0x03BE => ["/xi", 444],
322
+ 0x03BF => ["/omicron", 500],
323
+ 0x03C0 => ["/pi", 504],
324
+ 0x03C1 => ["/rho", 500],
325
+ 0x03C2 => ["/sigma1", 396],
326
+ 0x03C3 => ["/sigma", 540],
327
+ 0x03C4 => ["/tau", 400],
328
+ 0x03C5 => ["/upsilon", 496],
329
+ 0x03C6 => ["/phi", 578],
330
+ 0x03C7 => ["/chi", 444],
331
+ 0x03C8 => ["/psi", 624],
332
+ 0x03C9 => ["/omega", 658],
333
+ 0x03CA => ["/iotadieresis", 268],
334
+ 0x03CB => ["/upsilondieresis", 496],
335
+ 0x03CC => ["/omicrontonos", 500],
336
+ 0x03CD => ["/upsilontonos", 496],
337
+ 0x03CE => ["/omegatonos", 658],
270
338
  0x0401 => ["/afii10023", 611],
271
339
  0x0402 => ["/afii10051", 752],
272
340
  0x0403 => ["/afii10052", 578],
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdfbeads
3
3
  version: !ruby/object:Gem::Version
4
- hash: 29
4
+ hash: 25
5
5
  prerelease:
6
6
  segments:
7
7
  - 1
8
8
  - 0
9
- - 5
10
- version: 1.0.5
9
+ - 7
10
+ version: 1.0.7
11
11
  platform: ruby
12
12
  authors:
13
13
  - Alexey Kryukov
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2012-02-04 00:00:00 +04:00
18
+ date: 2012-02-10 00:00:00 +04:00
19
19
  default_executable: pdfbeads
20
20
  dependencies: []
21
21