pdfbeads 1.0.5 → 1.0.7
Sign up to get free protection for your applications and to get access to all the features.
- data/ChangeLog +14 -1
- data/bin/pdfbeads +1 -1
- data/lib/pdfbeads/pdfbuilder.rb +178 -57
- data/lib/pdfbeads/pdffont.rb +68 -0
- metadata +4 -4
data/ChangeLog
CHANGED
@@ -25,4 +25,17 @@
|
|
25
25
|
JPEG files were still written with the 'JP2' extension.
|
26
26
|
|
27
27
|
* Some tweaks to minimize the effect of page labels being inconsistently handled
|
28
|
-
in various PDF viewers (prefer ISO-8859-1 strings if possible).
|
28
|
+
in various PDF viewers (prefer ISO-8859-1 strings if possible).
|
29
|
+
|
30
|
+
2012 February 5 (Alexey Kryukov) Version 1.0.6
|
31
|
+
|
32
|
+
+ Add Greek letters (the monotonic set) to the list of characters with hardcoded
|
33
|
+
glyph names and width.
|
34
|
+
|
35
|
+
* Minor bugs fixed.
|
36
|
+
|
37
|
+
2012 February 10 (Alexey Kryukov) Version 1.0.7
|
38
|
+
|
39
|
+
+ An attempt to achieve better positioning of the hidden text layer, taking into
|
40
|
+
account not just lines, but also individual words. This should work with hOCR
|
41
|
+
files produced with Cuneiform or Tesseract.
|
data/bin/pdfbeads
CHANGED
@@ -144,7 +144,7 @@ OptionParser.new() do |opts|
|
|
144
144
|
opts.on("-b", "--bg-compression FORMAT",
|
145
145
|
['JP2', 'JPX', 'J2K', 'JPEG2000', 'JPG', 'JPEG', 'LOSSLESS', 'PNG', 'DEFLATE'],
|
146
146
|
"Compression method for background images. Acceptable",
|
147
|
-
"values are JP2|JPX|JPEG2000, JPG|JPEG or LOSSLESS.",
|
147
|
+
"values are JP2|JPX|JPEG2000, JPG|JPEG or PNG|LOSSLESS.",
|
148
148
|
"JP2 is used by default, unless this format is not",
|
149
149
|
"supported by the available version of ImageMagick" ) do |format|
|
150
150
|
case format.upcase
|
data/lib/pdfbeads/pdfbuilder.rb
CHANGED
@@ -357,7 +357,7 @@ class PDFBeads::PDFBuilder
|
|
357
357
|
fin.each do |fl|
|
358
358
|
next if /^\#/.match( fl )
|
359
359
|
|
360
|
-
if /^\/?([A-Za-z]+)[
|
360
|
+
if /^\/?([A-Za-z]+)[ \t]*:[ \t]+\"(.*)\"/.match( fl )
|
361
361
|
key = $1
|
362
362
|
if keys.include? key
|
363
363
|
begin
|
@@ -390,8 +390,8 @@ class PDFBeads::PDFBuilder
|
|
390
390
|
end
|
391
391
|
|
392
392
|
item_text = item[:title].to_binary
|
393
|
-
item_text.sub!( /\
|
394
|
-
item_text.sub!( /\
|
393
|
+
item_text.sub!( /\x28/,"\x5C\x28" )
|
394
|
+
item_text.sub!( /\x29/,"\x5C\x29" )
|
395
395
|
item[:pdfobj] = XObj.new(Hash[
|
396
396
|
'Title' => "(\xFE\xFF#{item_text.to_text})",
|
397
397
|
'Parent' => ref(item[:parent][:pdfobj].getID),
|
@@ -442,10 +442,95 @@ class PDFBeads::PDFBuilder
|
|
442
442
|
return out
|
443
443
|
end
|
444
444
|
|
445
|
+
def elementText( elem,charset )
|
446
|
+
txt = ''
|
447
|
+
begin
|
448
|
+
txt = elem.to_plain_text.strip
|
449
|
+
txt = Iconv.iconv( 'utf-8',charset,txt ).first unless charset.downcase.eql? 'utf-8'
|
450
|
+
rescue
|
451
|
+
end
|
452
|
+
|
453
|
+
txt.force_encoding( 'utf-8' ) if txt.respond_to? :force_encoding
|
454
|
+
return txt
|
455
|
+
end
|
456
|
+
|
457
|
+
def getOCRUnits( ocr_line,lbbox,fsize,charset,xscale,yscale )
|
458
|
+
units = Array.new()
|
459
|
+
ocr_words = ocr_line.search("//span[@class='ocrx_word']")
|
460
|
+
ocr_chars = nil
|
461
|
+
ocr_chars = ocr_line.at("//span[@class='ocr_cinfo']") if ocr_words.length == 0
|
462
|
+
|
463
|
+
# If 'ocrx_word' elements are available (as in Tesseract output), split the line
|
464
|
+
# into individual words
|
465
|
+
if ocr_words.length > 0
|
466
|
+
ocr_words.each do |word|
|
467
|
+
bbox = elementCoordinates( word,xscale,yscale )
|
468
|
+
txt = elementText( word,charset )
|
469
|
+
units << [txt,bbox]
|
470
|
+
end
|
471
|
+
|
472
|
+
# If 'ocr_cinfo' data is available (as in Cuneiform output), then split it
|
473
|
+
# into individual characters and then combine them into words
|
474
|
+
elsif not ocr_chars.nil? and ocr_chars.attributes.to_hash.has_key? 'title'
|
475
|
+
if /x_bboxes([-\s\d]+)/.match( ocr_chars.attributes.to_hash['title'] )
|
476
|
+
coords = $1.strip.split(/\s+/)
|
477
|
+
ltxt = elementText( ocr_line,charset )
|
478
|
+
charcnt = 0
|
479
|
+
ltxt.each_char { |uc| charcnt += 1 }
|
480
|
+
|
481
|
+
if charcnt <= coords.length/4
|
482
|
+
i = 0
|
483
|
+
wtxt = ''
|
484
|
+
bbox = [-1,-1,-1,-1]
|
485
|
+
ltxt.each_char do |uc|
|
486
|
+
cbbox = [ (coords[i*4].to_i*xscale).to_f,(coords[i*4+1].to_i*xscale).to_f,
|
487
|
+
(coords[i*4+2].to_i*yscale).to_f,(coords[i*4+3].to_i*yscale).to_f ]
|
488
|
+
|
489
|
+
unless cbbox[0] < 0
|
490
|
+
bbox[0] = cbbox[0] if cbbox[0] < bbox[0] or bbox[0] < 0
|
491
|
+
bbox[1] = cbbox[1] if cbbox[1] < bbox[1] or bbox[1] < 0
|
492
|
+
bbox[2] = cbbox[2] if cbbox[2] > bbox[2] or bbox[2] < 0
|
493
|
+
bbox[3] = cbbox[3] if cbbox[3] > bbox[3] or bbox[3] < 0
|
494
|
+
wtxt << uc
|
495
|
+
|
496
|
+
else
|
497
|
+
units << [wtxt,bbox]
|
498
|
+
bbox = [-1,-1,-1,-1]
|
499
|
+
if /^\s+$/.match( uc )
|
500
|
+
wtxt = ''
|
501
|
+
|
502
|
+
# A workaround for a probable hpricot bug, which sometimes causes whitespace
|
503
|
+
# characters from inside a string to be stripped. So if we find
|
504
|
+
# a bounding box with negative values we assume there was a whitespace
|
505
|
+
# character here, even if not preserved in the string itself
|
506
|
+
else
|
507
|
+
wtxt = uc
|
508
|
+
i += 1
|
509
|
+
bbox = [ (coords[i*4].to_i*xscale).to_f,(coords[i*4+1].to_i*xscale).to_f,
|
510
|
+
(coords[i*4+2].to_i*yscale).to_f,(coords[i*4+3].to_i*yscale).to_f ]
|
511
|
+
end
|
512
|
+
end
|
513
|
+
i += 1
|
514
|
+
end
|
515
|
+
units << [wtxt,bbox] unless wtxt.eql? ''
|
516
|
+
end
|
517
|
+
end
|
518
|
+
end
|
519
|
+
|
520
|
+
# If neither word nor character bounding boxes are available, then store the line as a whole
|
521
|
+
if units.length == 0
|
522
|
+
ltxt = elementText( ocr_line,charset )
|
523
|
+
units << [ltxt,lbbox] unless ltxt.eql? ''
|
524
|
+
end
|
525
|
+
|
526
|
+
units[units.length-1][0].sub!( /-\Z/, "\xC2\xAD" ) unless units.length == 0
|
527
|
+
return units
|
528
|
+
end
|
529
|
+
|
445
530
|
def getPDFText( hocr,pheight,xscale,yscale,encodings )
|
446
531
|
fsize = 10
|
447
|
-
cur_enc =
|
448
|
-
ret = " BT 3 Tr
|
532
|
+
cur_enc = nil
|
533
|
+
ret = " BT 3 Tr "
|
449
534
|
|
450
535
|
charset = 'utf-8'
|
451
536
|
hocr.search("//meta[@http-equiv='Content-Type']").each do |el|
|
@@ -455,71 +540,107 @@ class PDFBeads::PDFBuilder
|
|
455
540
|
end
|
456
541
|
|
457
542
|
hocr.search("//span[@class='ocr_line']").each do |line|
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
543
|
+
lbbox = elementCoordinates( line,xscale,yscale )
|
544
|
+
next if lbbox[2] - lbbox[0] <= 0 or lbbox[3] - lbbox[1] <= 0
|
545
|
+
units = getOCRUnits( line,lbbox,fsize,charset,xscale,yscale )
|
546
|
+
next if units.length == 0
|
547
|
+
|
548
|
+
wwidth = 0
|
549
|
+
ltxt = ''
|
550
|
+
units.each do |unit|
|
551
|
+
ltxt << unit[0]
|
552
|
+
wwidth += ( unit[1][2] - unit[1][0] )
|
463
553
|
end
|
464
|
-
|
465
|
-
|
466
|
-
|
554
|
+
ratio = wwidth / @fdata.getLineWidth( ltxt,fsize )
|
555
|
+
pos = lbbox[0]
|
556
|
+
posdiff = 0
|
467
557
|
|
468
|
-
bbox = elementCoordinates( line,xscale,yscale )
|
469
|
-
ratio = ( bbox[2] - bbox[0] ) / @fdata.getLineWidth( txt,fsize )
|
470
558
|
ret << sprintf( "%f %f %f %f %f %f Tm ",
|
471
|
-
ratio, 0, 0, ratio,
|
472
|
-
|
473
|
-
txt8 = ''
|
474
|
-
txt.each_char do |char|
|
475
|
-
begin
|
476
|
-
Iconv.iconv( "utf-16be","utf-8",char )
|
477
|
-
rescue
|
478
|
-
rawbytes = char.unpack( 'C*' )
|
479
|
-
bs = ''
|
480
|
-
rawbytes.each{ |b| bs << sprintf( "%02x",b ) }
|
481
|
-
$stderr.puts( "Warning: an invalid UTF-8 sequence (#{bs}) in the hOCR data." )
|
482
|
-
char = '?' * rawbytes.length
|
483
|
-
end
|
559
|
+
ratio, 0, 0, ratio, lbbox[0], pheight - lbbox[3] - @fdata.header['Descent'] * fsize / 1000.0 * ratio)
|
560
|
+
in_txt = false
|
484
561
|
|
485
|
-
|
486
|
-
|
487
|
-
|
488
|
-
|
489
|
-
|
490
|
-
|
491
|
-
|
492
|
-
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
|
498
|
-
|
562
|
+
units.each_index do |i|
|
563
|
+
unit = units[i]
|
564
|
+
wtxt = unit[0]
|
565
|
+
bbox = unit[1]
|
566
|
+
|
567
|
+
posdiff = ( (pos - bbox[0]) * 1000 / fsize / ratio ).to_i if i > 0
|
568
|
+
pos = bbox[0] + ( @fdata.getLineWidth( wtxt,fsize ) * ratio )
|
569
|
+
|
570
|
+
txt8 = ''
|
571
|
+
wtxt.each_char do |char|
|
572
|
+
begin
|
573
|
+
Iconv.iconv( "utf-16be","utf-8",char )
|
574
|
+
rescue
|
575
|
+
rawbytes = char.unpack( 'C*' )
|
576
|
+
bs = ''
|
577
|
+
rawbytes.each{ |b| bs << sprintf( "%02x",b ) }
|
578
|
+
$stderr.puts( "Warning: an invalid UTF-8 sequence (#{bs}) in the hOCR data." )
|
579
|
+
char = '?' * rawbytes.length
|
499
580
|
end
|
500
581
|
|
501
|
-
|
502
|
-
|
503
|
-
|
504
|
-
|
505
|
-
|
506
|
-
|
507
|
-
|
582
|
+
encoded = false
|
583
|
+
if cur_enc.nil? or not cur_enc.include? char
|
584
|
+
encodings.each_index do |i|
|
585
|
+
enc = encodings[i]
|
586
|
+
next if enc == cur_enc
|
587
|
+
|
588
|
+
if enc.include? char
|
589
|
+
if in_txt
|
590
|
+
ret << "#{posdiff} " if posdiff != 0
|
591
|
+
ret << "<#{txt8}> " unless txt8.eql? ''
|
592
|
+
ret << "] TJ "
|
593
|
+
end
|
594
|
+
cur_enc = enc
|
595
|
+
ret << "/Fnt#{i + 1} #{fsize} Tf "
|
596
|
+
txt8 = ''
|
597
|
+
posdiff = 0
|
598
|
+
encoded = true
|
599
|
+
in_txt = false
|
600
|
+
break
|
601
|
+
end
|
508
602
|
end
|
509
603
|
|
510
|
-
|
511
|
-
|
512
|
-
|
513
|
-
|
514
|
-
|
604
|
+
unless encoded
|
605
|
+
last = encodings[-1]
|
606
|
+
if last.length < 256
|
607
|
+
last << char
|
608
|
+
else
|
609
|
+
last = [ ' ',char ]
|
610
|
+
encodings << last
|
611
|
+
end
|
612
|
+
|
613
|
+
if cur_enc != last
|
614
|
+
if in_txt
|
615
|
+
ret << "#{posdiff} " if posdiff != 0
|
616
|
+
ret << "<#{txt8}> " unless txt8.eql? ''
|
617
|
+
ret << "] TJ "
|
618
|
+
end
|
619
|
+
cur_enc = last
|
620
|
+
ret << "/Fnt#{encodings.length} #{fsize} Tf "
|
621
|
+
txt8 = ''
|
622
|
+
posdiff = 0
|
623
|
+
in_txt = false
|
624
|
+
end
|
515
625
|
end
|
516
626
|
end
|
627
|
+
|
628
|
+
unless in_txt
|
629
|
+
ret << "[ "
|
630
|
+
in_txt = true
|
631
|
+
end
|
632
|
+
txt8 << sprintf( "%02X",cur_enc.index(char) )
|
517
633
|
end
|
518
634
|
|
519
|
-
txt8
|
635
|
+
unless txt8.eql? ''
|
636
|
+
ret << "#{posdiff} " if posdiff != 0
|
637
|
+
ret << "<#{txt8}> "
|
638
|
+
end
|
639
|
+
end
|
640
|
+
if in_txt
|
641
|
+
ret << "] TJ "
|
642
|
+
in_txt = false
|
520
643
|
end
|
521
|
-
|
522
|
-
ret << "<#{txt8}> Tj " unless txt8.eql? ''
|
523
644
|
end
|
524
645
|
|
525
646
|
ret << "ET "
|
data/lib/pdfbeads/pdffont.rb
CHANGED
@@ -266,7 +266,75 @@ class PDFBeads::PDFBuilder::FontDataProvider
|
|
266
266
|
0x02DB => ["/ogonek", 333],
|
267
267
|
0x02DC => ["/tilde", 333],
|
268
268
|
0x02DD => ["/hungarumlaut", 333],
|
269
|
+
0x0338 => ["/Alphatonos", 722],
|
270
|
+
0x0388 => ["/Epsilontonos", 694],
|
271
|
+
0x0389 => ["/Etatonos", 808],
|
272
|
+
0x038A => ["/Iotatonos", 412],
|
273
|
+
0x038C => ["/Omicrontonos", 722],
|
274
|
+
0x038E => ["/Upsilontonos", 816],
|
275
|
+
0x038F => ["/Omegatonos", 744],
|
276
|
+
0x03AC => ["/alphatonos", 522],
|
277
|
+
0x03AD => ["/epsilontonos", 420],
|
278
|
+
0x03AE => ["/etatonos", 522],
|
279
|
+
0x03AF => ["/iotatonos", 268],
|
280
|
+
0x0390 => ["/iotadieresistonos", 268],
|
281
|
+
0x0391 => ["/Alpha", 722],
|
282
|
+
0x0392 => ["/Beta", 667],
|
283
|
+
0x0393 => ["/Gamma", 578],
|
269
284
|
0x0394 => ["/Delta", 643],
|
285
|
+
0x0395 => ["/Epsilon", 611],
|
286
|
+
0x0396 => ["/Zeta", 611],
|
287
|
+
0x0397 => ["/Eta", 722],
|
288
|
+
0x0398 => ["/Theta", 722],
|
289
|
+
0x0399 => ["/Iota", 333],
|
290
|
+
0x039A => ["/Kappa", 722],
|
291
|
+
0x039B => ["/Lambda", 724],
|
292
|
+
0x039C => ["/Mu", 889],
|
293
|
+
0x039D => ["/Nu", 722],
|
294
|
+
0x039E => ["/Xi", 643],
|
295
|
+
0x039F => ["/Omicron", 722],
|
296
|
+
0x03A0 => ["/Pi", 722],
|
297
|
+
0x03A1 => ["/Rho", 556],
|
298
|
+
0x03A3 => ["/Sigma", 582],
|
299
|
+
0x03A4 => ["/Tau", 611],
|
300
|
+
0x03A5 => ["/Upsilon", 722],
|
301
|
+
0x03A6 => ["/Phi", 730],
|
302
|
+
0x03A7 => ["/Chi", 722],
|
303
|
+
0x03A8 => ["/Psi", 737],
|
304
|
+
0x03A9 => ["/Omega", 744],
|
305
|
+
0x03AA => ["/Iotadieresis", 333],
|
306
|
+
0x03AB => ["/Upsilondieresis", 722],
|
307
|
+
0x03B0 => ["/upsilondieresistonos", 496],
|
308
|
+
0x03B1 => ["/alpha", 522],
|
309
|
+
0x03B2 => ["/beta", 508],
|
310
|
+
0x03B3 => ["/gamma", 440],
|
311
|
+
0x03B4 => ["/delta", 471],
|
312
|
+
0x03B5 => ["/epsilon", 420],
|
313
|
+
0x03B6 => ["/zeta", 414],
|
314
|
+
0x03B7 => ["/eta", 522],
|
315
|
+
0x03B8 => ["/theta", 480],
|
316
|
+
0x03B9 => ["/iota", 268],
|
317
|
+
0x03BA => ["/kappa", 502],
|
318
|
+
0x03BB => ["/lambda", 484],
|
319
|
+
0x03BC => ["/mu", 500],
|
320
|
+
0x03BD => ["/nu", 452],
|
321
|
+
0x03BE => ["/xi", 444],
|
322
|
+
0x03BF => ["/omicron", 500],
|
323
|
+
0x03C0 => ["/pi", 504],
|
324
|
+
0x03C1 => ["/rho", 500],
|
325
|
+
0x03C2 => ["/sigma1", 396],
|
326
|
+
0x03C3 => ["/sigma", 540],
|
327
|
+
0x03C4 => ["/tau", 400],
|
328
|
+
0x03C5 => ["/upsilon", 496],
|
329
|
+
0x03C6 => ["/phi", 578],
|
330
|
+
0x03C7 => ["/chi", 444],
|
331
|
+
0x03C8 => ["/psi", 624],
|
332
|
+
0x03C9 => ["/omega", 658],
|
333
|
+
0x03CA => ["/iotadieresis", 268],
|
334
|
+
0x03CB => ["/upsilondieresis", 496],
|
335
|
+
0x03CC => ["/omicrontonos", 500],
|
336
|
+
0x03CD => ["/upsilontonos", 496],
|
337
|
+
0x03CE => ["/omegatonos", 658],
|
270
338
|
0x0401 => ["/afii10023", 611],
|
271
339
|
0x0402 => ["/afii10051", 752],
|
272
340
|
0x0403 => ["/afii10052", 578],
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdfbeads
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 25
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 1
|
8
8
|
- 0
|
9
|
-
-
|
10
|
-
version: 1.0.
|
9
|
+
- 7
|
10
|
+
version: 1.0.7
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Alexey Kryukov
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2012-02-
|
18
|
+
date: 2012-02-10 00:00:00 +04:00
|
19
19
|
default_executable: pdfbeads
|
20
20
|
dependencies: []
|
21
21
|
|