pdfbeads 1.0.5 → 1.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ChangeLog +14 -1
- data/bin/pdfbeads +1 -1
- data/lib/pdfbeads/pdfbuilder.rb +178 -57
- data/lib/pdfbeads/pdffont.rb +68 -0
- metadata +4 -4
data/ChangeLog
CHANGED
@@ -25,4 +25,17 @@
|
|
25
25
|
JPEG files were still written with the 'JP2' extension.
|
26
26
|
|
27
27
|
* Some tweaks to minimize the effect of page labels being inconsistently handled
|
28
|
-
in various PDF viewers (prefer ISO-8859-1 strings if possible).
|
28
|
+
in various PDF viewers (prefer ISO-8859-1 strings if possible).
|
29
|
+
|
30
|
+
2012 February 5 (Alexey Kryukov) Version 1.0.6
|
31
|
+
|
32
|
+
+ Add Greek letters (the monotonic set) to the list of characters with hardcoded
|
33
|
+
glyph names and widths.
|
34
|
+
|
35
|
+
* Minor bugs fixed.
|
36
|
+
|
37
|
+
2012 February 10 (Alexey Kryukov) Version 1.0.7
|
38
|
+
|
39
|
+
+ An attempt to achieve better positioning of the hidden text layer, taking into
|
40
|
+
account not just lines, but also individual words. This should work with hOCR
|
41
|
+
files produced with Cuneiform or Tesseract.
|
data/bin/pdfbeads
CHANGED
@@ -144,7 +144,7 @@ OptionParser.new() do |opts|
|
|
144
144
|
opts.on("-b", "--bg-compression FORMAT",
|
145
145
|
['JP2', 'JPX', 'J2K', 'JPEG2000', 'JPG', 'JPEG', 'LOSSLESS', 'PNG', 'DEFLATE'],
|
146
146
|
"Compression method for background images. Acceptable",
|
147
|
-
"values are JP2|JPX|JPEG2000, JPG|JPEG or LOSSLESS.",
|
147
|
+
"values are JP2|JPX|JPEG2000, JPG|JPEG or PNG|LOSSLESS.",
|
148
148
|
"JP2 is used by default, unless this format is not",
|
149
149
|
"supported by the available version of ImageMagick" ) do |format|
|
150
150
|
case format.upcase
|
data/lib/pdfbeads/pdfbuilder.rb
CHANGED
@@ -357,7 +357,7 @@ class PDFBeads::PDFBuilder
|
|
357
357
|
fin.each do |fl|
|
358
358
|
next if /^\#/.match( fl )
|
359
359
|
|
360
|
-
if /^\/?([A-Za-z]+)[
|
360
|
+
if /^\/?([A-Za-z]+)[ \t]*:[ \t]+\"(.*)\"/.match( fl )
|
361
361
|
key = $1
|
362
362
|
if keys.include? key
|
363
363
|
begin
|
@@ -390,8 +390,8 @@ class PDFBeads::PDFBuilder
|
|
390
390
|
end
|
391
391
|
|
392
392
|
item_text = item[:title].to_binary
|
393
|
-
item_text.sub!( /\
|
394
|
-
item_text.sub!( /\
|
393
|
+
item_text.sub!( /\x28/,"\x5C\x28" )
|
394
|
+
item_text.sub!( /\x29/,"\x5C\x29" )
|
395
395
|
item[:pdfobj] = XObj.new(Hash[
|
396
396
|
'Title' => "(\xFE\xFF#{item_text.to_text})",
|
397
397
|
'Parent' => ref(item[:parent][:pdfobj].getID),
|
@@ -442,10 +442,95 @@ class PDFBeads::PDFBuilder
|
|
442
442
|
return out
|
443
443
|
end
|
444
444
|
|
445
|
+
def elementText( elem,charset )
|
446
|
+
txt = ''
|
447
|
+
begin
|
448
|
+
txt = elem.to_plain_text.strip
|
449
|
+
txt = Iconv.iconv( 'utf-8',charset,txt ).first unless charset.downcase.eql? 'utf-8'
|
450
|
+
rescue
|
451
|
+
end
|
452
|
+
|
453
|
+
txt.force_encoding( 'utf-8' ) if txt.respond_to? :force_encoding
|
454
|
+
return txt
|
455
|
+
end
|
456
|
+
|
457
|
+
def getOCRUnits( ocr_line,lbbox,fsize,charset,xscale,yscale )
|
458
|
+
units = Array.new()
|
459
|
+
ocr_words = ocr_line.search("//span[@class='ocrx_word']")
|
460
|
+
ocr_chars = nil
|
461
|
+
ocr_chars = ocr_line.at("//span[@class='ocr_cinfo']") if ocr_words.length == 0
|
462
|
+
|
463
|
+
# If 'ocrx_word' elements are available (as in Tesseract output), split the line
|
464
|
+
# into individual words
|
465
|
+
if ocr_words.length > 0
|
466
|
+
ocr_words.each do |word|
|
467
|
+
bbox = elementCoordinates( word,xscale,yscale )
|
468
|
+
txt = elementText( word,charset )
|
469
|
+
units << [txt,bbox]
|
470
|
+
end
|
471
|
+
|
472
|
+
# If 'ocr_cinfo' data is available (as in Cuneiform output), then split it
|
473
|
+
# into individual characters and then combine them into words
|
474
|
+
elsif not ocr_chars.nil? and ocr_chars.attributes.to_hash.has_key? 'title'
|
475
|
+
if /x_bboxes([-\s\d]+)/.match( ocr_chars.attributes.to_hash['title'] )
|
476
|
+
coords = $1.strip.split(/\s+/)
|
477
|
+
ltxt = elementText( ocr_line,charset )
|
478
|
+
charcnt = 0
|
479
|
+
ltxt.each_char { |uc| charcnt += 1 }
|
480
|
+
|
481
|
+
if charcnt <= coords.length/4
|
482
|
+
i = 0
|
483
|
+
wtxt = ''
|
484
|
+
bbox = [-1,-1,-1,-1]
|
485
|
+
ltxt.each_char do |uc|
|
486
|
+
cbbox = [ (coords[i*4].to_i*xscale).to_f,(coords[i*4+1].to_i*xscale).to_f,
|
487
|
+
(coords[i*4+2].to_i*yscale).to_f,(coords[i*4+3].to_i*yscale).to_f ]
|
488
|
+
|
489
|
+
unless cbbox[0] < 0
|
490
|
+
bbox[0] = cbbox[0] if cbbox[0] < bbox[0] or bbox[0] < 0
|
491
|
+
bbox[1] = cbbox[1] if cbbox[1] < bbox[1] or bbox[1] < 0
|
492
|
+
bbox[2] = cbbox[2] if cbbox[2] > bbox[2] or bbox[2] < 0
|
493
|
+
bbox[3] = cbbox[3] if cbbox[3] > bbox[3] or bbox[3] < 0
|
494
|
+
wtxt << uc
|
495
|
+
|
496
|
+
else
|
497
|
+
units << [wtxt,bbox]
|
498
|
+
bbox = [-1,-1,-1,-1]
|
499
|
+
if /^\s+$/.match( uc )
|
500
|
+
wtxt = ''
|
501
|
+
|
502
|
+
# A workaround for a probable hpricot bug, which sometimes causes whitespace
|
503
|
+
# characters from inside a string to be stripped. So if we find
|
504
|
+
# a bounding box with negative values we assume there was a whitespace
|
505
|
+
# character here, even if not preserved in the string itself
|
506
|
+
else
|
507
|
+
wtxt = uc
|
508
|
+
i += 1
|
509
|
+
bbox = [ (coords[i*4].to_i*xscale).to_f,(coords[i*4+1].to_i*xscale).to_f,
|
510
|
+
(coords[i*4+2].to_i*yscale).to_f,(coords[i*4+3].to_i*yscale).to_f ]
|
511
|
+
end
|
512
|
+
end
|
513
|
+
i += 1
|
514
|
+
end
|
515
|
+
units << [wtxt,bbox] unless wtxt.eql? ''
|
516
|
+
end
|
517
|
+
end
|
518
|
+
end
|
519
|
+
|
520
|
+
# If neither word nor character bounding boxes are available, then store the line as a whole
|
521
|
+
if units.length == 0
|
522
|
+
ltxt = elementText( ocr_line,charset )
|
523
|
+
units << [ltxt,lbbox] unless ltxt.eql? ''
|
524
|
+
end
|
525
|
+
|
526
|
+
units[units.length-1][0].sub!( /-\Z/, "\xC2\xAD" ) unless units.length == 0
|
527
|
+
return units
|
528
|
+
end
|
529
|
+
|
445
530
|
def getPDFText( hocr,pheight,xscale,yscale,encodings )
|
446
531
|
fsize = 10
|
447
|
-
cur_enc =
|
448
|
-
ret = " BT 3 Tr
|
532
|
+
cur_enc = nil
|
533
|
+
ret = " BT 3 Tr "
|
449
534
|
|
450
535
|
charset = 'utf-8'
|
451
536
|
hocr.search("//meta[@http-equiv='Content-Type']").each do |el|
|
@@ -455,71 +540,107 @@ class PDFBeads::PDFBuilder
|
|
455
540
|
end
|
456
541
|
|
457
542
|
hocr.search("//span[@class='ocr_line']").each do |line|
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
543
|
+
lbbox = elementCoordinates( line,xscale,yscale )
|
544
|
+
next if lbbox[2] - lbbox[0] <= 0 or lbbox[3] - lbbox[1] <= 0
|
545
|
+
units = getOCRUnits( line,lbbox,fsize,charset,xscale,yscale )
|
546
|
+
next if units.length == 0
|
547
|
+
|
548
|
+
wwidth = 0
|
549
|
+
ltxt = ''
|
550
|
+
units.each do |unit|
|
551
|
+
ltxt << unit[0]
|
552
|
+
wwidth += ( unit[1][2] - unit[1][0] )
|
463
553
|
end
|
464
|
-
|
465
|
-
|
466
|
-
|
554
|
+
ratio = wwidth / @fdata.getLineWidth( ltxt,fsize )
|
555
|
+
pos = lbbox[0]
|
556
|
+
posdiff = 0
|
467
557
|
|
468
|
-
bbox = elementCoordinates( line,xscale,yscale )
|
469
|
-
ratio = ( bbox[2] - bbox[0] ) / @fdata.getLineWidth( txt,fsize )
|
470
558
|
ret << sprintf( "%f %f %f %f %f %f Tm ",
|
471
|
-
ratio, 0, 0, ratio,
|
472
|
-
|
473
|
-
txt8 = ''
|
474
|
-
txt.each_char do |char|
|
475
|
-
begin
|
476
|
-
Iconv.iconv( "utf-16be","utf-8",char )
|
477
|
-
rescue
|
478
|
-
rawbytes = char.unpack( 'C*' )
|
479
|
-
bs = ''
|
480
|
-
rawbytes.each{ |b| bs << sprintf( "%02x",b ) }
|
481
|
-
$stderr.puts( "Warning: an invalid UTF-8 sequence (#{bs}) in the hOCR data." )
|
482
|
-
char = '?' * rawbytes.length
|
483
|
-
end
|
559
|
+
ratio, 0, 0, ratio, lbbox[0], pheight - lbbox[3] - @fdata.header['Descent'] * fsize / 1000.0 * ratio)
|
560
|
+
in_txt = false
|
484
561
|
|
485
|
-
|
486
|
-
|
487
|
-
|
488
|
-
|
489
|
-
|
490
|
-
|
491
|
-
|
492
|
-
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
|
498
|
-
|
562
|
+
units.each_index do |i|
|
563
|
+
unit = units[i]
|
564
|
+
wtxt = unit[0]
|
565
|
+
bbox = unit[1]
|
566
|
+
|
567
|
+
posdiff = ( (pos - bbox[0]) * 1000 / fsize / ratio ).to_i if i > 0
|
568
|
+
pos = bbox[0] + ( @fdata.getLineWidth( wtxt,fsize ) * ratio )
|
569
|
+
|
570
|
+
txt8 = ''
|
571
|
+
wtxt.each_char do |char|
|
572
|
+
begin
|
573
|
+
Iconv.iconv( "utf-16be","utf-8",char )
|
574
|
+
rescue
|
575
|
+
rawbytes = char.unpack( 'C*' )
|
576
|
+
bs = ''
|
577
|
+
rawbytes.each{ |b| bs << sprintf( "%02x",b ) }
|
578
|
+
$stderr.puts( "Warning: an invalid UTF-8 sequence (#{bs}) in the hOCR data." )
|
579
|
+
char = '?' * rawbytes.length
|
499
580
|
end
|
500
581
|
|
501
|
-
|
502
|
-
|
503
|
-
|
504
|
-
|
505
|
-
|
506
|
-
|
507
|
-
|
582
|
+
encoded = false
|
583
|
+
if cur_enc.nil? or not cur_enc.include? char
|
584
|
+
encodings.each_index do |i|
|
585
|
+
enc = encodings[i]
|
586
|
+
next if enc == cur_enc
|
587
|
+
|
588
|
+
if enc.include? char
|
589
|
+
if in_txt
|
590
|
+
ret << "#{posdiff} " if posdiff != 0
|
591
|
+
ret << "<#{txt8}> " unless txt8.eql? ''
|
592
|
+
ret << "] TJ "
|
593
|
+
end
|
594
|
+
cur_enc = enc
|
595
|
+
ret << "/Fnt#{i + 1} #{fsize} Tf "
|
596
|
+
txt8 = ''
|
597
|
+
posdiff = 0
|
598
|
+
encoded = true
|
599
|
+
in_txt = false
|
600
|
+
break
|
601
|
+
end
|
508
602
|
end
|
509
603
|
|
510
|
-
|
511
|
-
|
512
|
-
|
513
|
-
|
514
|
-
|
604
|
+
unless encoded
|
605
|
+
last = encodings[-1]
|
606
|
+
if last.length < 256
|
607
|
+
last << char
|
608
|
+
else
|
609
|
+
last = [ ' ',char ]
|
610
|
+
encodings << last
|
611
|
+
end
|
612
|
+
|
613
|
+
if cur_enc != last
|
614
|
+
if in_txt
|
615
|
+
ret << "#{posdiff} " if posdiff != 0
|
616
|
+
ret << "<#{txt8}> " unless txt8.eql? ''
|
617
|
+
ret << "] TJ "
|
618
|
+
end
|
619
|
+
cur_enc = last
|
620
|
+
ret << "/Fnt#{encodings.length} #{fsize} Tf "
|
621
|
+
txt8 = ''
|
622
|
+
posdiff = 0
|
623
|
+
in_txt = false
|
624
|
+
end
|
515
625
|
end
|
516
626
|
end
|
627
|
+
|
628
|
+
unless in_txt
|
629
|
+
ret << "[ "
|
630
|
+
in_txt = true
|
631
|
+
end
|
632
|
+
txt8 << sprintf( "%02X",cur_enc.index(char) )
|
517
633
|
end
|
518
634
|
|
519
|
-
txt8
|
635
|
+
unless txt8.eql? ''
|
636
|
+
ret << "#{posdiff} " if posdiff != 0
|
637
|
+
ret << "<#{txt8}> "
|
638
|
+
end
|
639
|
+
end
|
640
|
+
if in_txt
|
641
|
+
ret << "] TJ "
|
642
|
+
in_txt = false
|
520
643
|
end
|
521
|
-
|
522
|
-
ret << "<#{txt8}> Tj " unless txt8.eql? ''
|
523
644
|
end
|
524
645
|
|
525
646
|
ret << "ET "
|
data/lib/pdfbeads/pdffont.rb
CHANGED
@@ -266,7 +266,75 @@ class PDFBeads::PDFBuilder::FontDataProvider
|
|
266
266
|
0x02DB => ["/ogonek", 333],
|
267
267
|
0x02DC => ["/tilde", 333],
|
268
268
|
0x02DD => ["/hungarumlaut", 333],
|
269
|
+
0x0338 => ["/Alphatonos", 722],
|
270
|
+
0x0388 => ["/Epsilontonos", 694],
|
271
|
+
0x0389 => ["/Etatonos", 808],
|
272
|
+
0x038A => ["/Iotatonos", 412],
|
273
|
+
0x038C => ["/Omicrontonos", 722],
|
274
|
+
0x038E => ["/Upsilontonos", 816],
|
275
|
+
0x038F => ["/Omegatonos", 744],
|
276
|
+
0x03AC => ["/alphatonos", 522],
|
277
|
+
0x03AD => ["/epsilontonos", 420],
|
278
|
+
0x03AE => ["/etatonos", 522],
|
279
|
+
0x03AF => ["/iotatonos", 268],
|
280
|
+
0x0390 => ["/iotadieresistonos", 268],
|
281
|
+
0x0391 => ["/Alpha", 722],
|
282
|
+
0x0392 => ["/Beta", 667],
|
283
|
+
0x0393 => ["/Gamma", 578],
|
269
284
|
0x0394 => ["/Delta", 643],
|
285
|
+
0x0395 => ["/Epsilon", 611],
|
286
|
+
0x0396 => ["/Zeta", 611],
|
287
|
+
0x0397 => ["/Eta", 722],
|
288
|
+
0x0398 => ["/Theta", 722],
|
289
|
+
0x0399 => ["/Iota", 333],
|
290
|
+
0x039A => ["/Kappa", 722],
|
291
|
+
0x039B => ["/Lambda", 724],
|
292
|
+
0x039C => ["/Mu", 889],
|
293
|
+
0x039D => ["/Nu", 722],
|
294
|
+
0x039E => ["/Xi", 643],
|
295
|
+
0x039F => ["/Omicron", 722],
|
296
|
+
0x03A0 => ["/Pi", 722],
|
297
|
+
0x03A1 => ["/Rho", 556],
|
298
|
+
0x03A3 => ["/Sigma", 582],
|
299
|
+
0x03A4 => ["/Tau", 611],
|
300
|
+
0x03A5 => ["/Upsilon", 722],
|
301
|
+
0x03A6 => ["/Phi", 730],
|
302
|
+
0x03A7 => ["/Chi", 722],
|
303
|
+
0x03A8 => ["/Psi", 737],
|
304
|
+
0x03A9 => ["/Omega", 744],
|
305
|
+
0x03AA => ["/Iotadieresis", 333],
|
306
|
+
0x03AB => ["/Upsilondieresis", 722],
|
307
|
+
0x03B0 => ["/upsilondieresistonos", 496],
|
308
|
+
0x03B1 => ["/alpha", 522],
|
309
|
+
0x03B2 => ["/beta", 508],
|
310
|
+
0x03B3 => ["/gamma", 440],
|
311
|
+
0x03B4 => ["/delta", 471],
|
312
|
+
0x03B5 => ["/epsilon", 420],
|
313
|
+
0x03B6 => ["/zeta", 414],
|
314
|
+
0x03B7 => ["/eta", 522],
|
315
|
+
0x03B8 => ["/theta", 480],
|
316
|
+
0x03B9 => ["/iota", 268],
|
317
|
+
0x03BA => ["/kappa", 502],
|
318
|
+
0x03BB => ["/lambda", 484],
|
319
|
+
0x03BC => ["/mu", 500],
|
320
|
+
0x03BD => ["/nu", 452],
|
321
|
+
0x03BE => ["/xi", 444],
|
322
|
+
0x03BF => ["/omicron", 500],
|
323
|
+
0x03C0 => ["/pi", 504],
|
324
|
+
0x03C1 => ["/rho", 500],
|
325
|
+
0x03C2 => ["/sigma1", 396],
|
326
|
+
0x03C3 => ["/sigma", 540],
|
327
|
+
0x03C4 => ["/tau", 400],
|
328
|
+
0x03C5 => ["/upsilon", 496],
|
329
|
+
0x03C6 => ["/phi", 578],
|
330
|
+
0x03C7 => ["/chi", 444],
|
331
|
+
0x03C8 => ["/psi", 624],
|
332
|
+
0x03C9 => ["/omega", 658],
|
333
|
+
0x03CA => ["/iotadieresis", 268],
|
334
|
+
0x03CB => ["/upsilondieresis", 496],
|
335
|
+
0x03CC => ["/omicrontonos", 500],
|
336
|
+
0x03CD => ["/upsilontonos", 496],
|
337
|
+
0x03CE => ["/omegatonos", 658],
|
270
338
|
0x0401 => ["/afii10023", 611],
|
271
339
|
0x0402 => ["/afii10051", 752],
|
272
340
|
0x0403 => ["/afii10052", 578],
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdfbeads
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 25
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 1
|
8
8
|
- 0
|
9
|
-
-
|
10
|
-
version: 1.0.
|
9
|
+
- 7
|
10
|
+
version: 1.0.7
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Alexey Kryukov
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2012-02-
|
18
|
+
date: 2012-02-10 00:00:00 +04:00
|
19
19
|
default_executable: pdfbeads
|
20
20
|
dependencies: []
|
21
21
|
|