pdfbeads 1.0.7 → 1.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/COPYING +0 -0
- data/ChangeLog +59 -0
- data/README +0 -0
- data/bin/pdfbeads +33 -4
- data/doc/pdfbeads.en.html +548 -0
- data/doc/pdfbeads.ru.html +74 -34
- data/lib/imageinspector.rb +24 -21
- data/lib/pdfbeads/pdfbuilder.rb +308 -87
- data/lib/pdfbeads/pdfdoc.rb +0 -0
- data/lib/pdfbeads/pdffont.rb +0 -0
- data/lib/pdfbeads/pdflabels.rb +0 -0
- data/lib/pdfbeads/pdfpage.rb +45 -32
- data/lib/pdfbeads/pdftoc.rb +7 -3
- data/lib/pdfbeads.rb +18 -7
- metadata +92 -61
data/lib/pdfbeads/pdfbuilder.rb
CHANGED
@@ -9,7 +9,7 @@
|
|
9
9
|
# Unlike other PDF creation tools, this utility attempts to implement
|
10
10
|
# the approach typically used for DjVu books. Its key feature is
|
11
11
|
# separating scanned text (typically black, but indexed images with
|
12
|
-
# a small number of colors are also accepted) from halftone images
|
12
|
+
# a small number of colors are also accepted) from halftone images
|
13
13
|
# placed into a background layer.
|
14
14
|
#
|
15
15
|
# Copyright (C) 2010 Alexey Kryukov (amkryukov@gmail.com).
|
@@ -69,6 +69,7 @@ class PDFBeads::PDFBuilder
|
|
69
69
|
labels = PDFLabels.new( @pdfargs[:labels] ) unless @pdfargs[:labels].nil?
|
70
70
|
toc = PDFTOC.new( @pdfargs[:toc] ) unless @pdfargs[:toc].nil?
|
71
71
|
meta = parseMeta( @pdfargs[:meta] )
|
72
|
+
reader = getPDFReader( @pdfargs[:textpdf] )
|
72
73
|
|
73
74
|
cat = XObj.new(Hash[
|
74
75
|
'Type' => '/Catalog',
|
@@ -98,12 +99,12 @@ class PDFBeads::PDFBuilder
|
|
98
99
|
info.addToDict(key, "(\xFE\xFF#{meta[key].to_text})")
|
99
100
|
end
|
100
101
|
|
101
|
-
|
102
|
-
|
103
|
-
'
|
104
|
-
|
105
|
-
|
106
|
-
|
102
|
+
if ( toc != nil and toc.length > 0 ) or @pdfargs[:rtl]
|
103
|
+
vpref = XObj.new(Hash.new())
|
104
|
+
vpref.addToDict('Direction', "/R2L") if @pdfargs[:rtl]
|
105
|
+
@doc.addObject(vpref)
|
106
|
+
cat.addToDict('ViewerPreferences', ref(vpref.getID))
|
107
|
+
end
|
107
108
|
|
108
109
|
pages = XObj.new(Hash[
|
109
110
|
'Type' => '/Pages'
|
@@ -132,8 +133,8 @@ class PDFBeads::PDFBuilder
|
|
132
133
|
'Intent' => '[/View/Design]'
|
133
134
|
})
|
134
135
|
@doc.addObject(ocBack)
|
135
|
-
cat.addToDict('OCProperties',
|
136
|
-
sprintf("<< /OCGs[%s %s] /D<< /Intent /View /BaseState
|
136
|
+
cat.addToDict('OCProperties',
|
137
|
+
sprintf("<< /OCGs[%s %s] /D<< /Intent /View /BaseState /ON /Order[%s %s] >>>>",
|
137
138
|
ref(ocFore.getID), ref(ocBack.getID), ref(ocFore.getID), ref(ocBack.getID)))
|
138
139
|
|
139
140
|
page_objs = Array.new()
|
@@ -150,10 +151,19 @@ class PDFBeads::PDFBuilder
|
|
150
151
|
begin
|
151
152
|
# If possible, use iso8859-1 (aka PDFDocEncoding) for page labels:
|
152
153
|
# it is at least guaranteed to be safe
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
154
|
+
if rng[:prefix].respond_to? :encode
|
155
|
+
ltitl = rng[:prefix].encode( "iso8859-1", "utf-8" )
|
156
|
+
else
|
157
|
+
ltitl = Iconv.iconv( "iso8859-1", "utf-8", rng[:prefix] ).first
|
158
|
+
end
|
159
|
+
nTree << "/P (#{ltitl.to_text}) "
|
160
|
+
# Iconv::InvalidCharacter, Iconv::IllegalSequence, Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
|
161
|
+
rescue
|
162
|
+
if rng[:prefix].respond_to? :encode
|
163
|
+
ltitl = rng[:prefix].encode( "utf-16be", "utf-8" )
|
164
|
+
else
|
165
|
+
ltitl = Iconv.iconv( "utf-16be", "utf-8", rng[:prefix] ).first
|
166
|
+
end
|
157
167
|
# If there is no number (just prefix) then put a zero character after the prefix:
|
158
168
|
# this makes acroread happy, but prevents displaying the number in evince
|
159
169
|
unless rng.has_key? :style
|
@@ -176,27 +186,31 @@ class PDFBeads::PDFBuilder
|
|
176
186
|
|
177
187
|
needs_font = false
|
178
188
|
fonts = encodings = nil
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
189
|
+
unless reader.nil?
|
190
|
+
fdict = importPDFFonts( reader,@pdfargs[:textpdf] )
|
191
|
+
else
|
192
|
+
pagefiles.each do |p|
|
193
|
+
unless p.hocr_path.nil?
|
194
|
+
needs_font = true
|
195
|
+
break
|
196
|
+
end
|
183
197
|
end
|
184
|
-
end
|
185
198
|
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
199
|
+
if needs_font
|
200
|
+
fonts = Array.new()
|
201
|
+
encodings = [ [' '] ]
|
202
|
+
fdict = XObj.new( Hash[] )
|
203
|
+
@doc.addObject( fdict )
|
204
|
+
|
205
|
+
descr = XObj.new( Hash[
|
206
|
+
'Type' => '/FontDescriptor',
|
207
|
+
'BaseFont' => '/Times-Roman',
|
208
|
+
] )
|
209
|
+
@fdata.header.each_key do |key|
|
210
|
+
descr.addToDict( key,@fdata.header[key] )
|
211
|
+
end
|
212
|
+
@doc.addObject( descr )
|
198
213
|
end
|
199
|
-
@doc.addObject( descr )
|
200
214
|
end
|
201
215
|
|
202
216
|
pagefiles.each do |p|
|
@@ -261,17 +275,24 @@ class PDFBeads::PDFBuilder
|
|
261
275
|
doc_objs.concat( [contents, resobj, resources] )
|
262
276
|
|
263
277
|
hocr = nil
|
264
|
-
|
265
|
-
hocr = open( p.hocr_path ) { |f| Hpricot.parse( f ) }
|
278
|
+
if not reader.nil?
|
266
279
|
procSet << '/Text'
|
267
|
-
c_str << getPDFText(
|
280
|
+
c_str << getPDFText( reader,pidx,@pdfargs[:debug] )
|
281
|
+
elsif not p.hocr_path.nil?
|
282
|
+
hocr = open( p.hocr_path ) { |f| Nokogiri::HTML( f ) }
|
283
|
+
procSet << '/Text'
|
284
|
+
c_str << getHOCRText( hocr,pheight,72.0/xres,72.0/yres,encodings )
|
268
285
|
end
|
269
286
|
|
270
|
-
|
271
|
-
|
272
|
-
|
287
|
+
unless @pdfargs[:debug]
|
288
|
+
contents.reinit( Hash[
|
289
|
+
'Filter' => '/FlateDecode'
|
290
|
+
], Zlib::Deflate.deflate( c_str,9 ) )
|
291
|
+
else
|
292
|
+
contents.reinit( Hash[], c_str )
|
293
|
+
end
|
273
294
|
resources.addToDict( 'ProcSet', "[ #{procSet.join(' ')} ]" )
|
274
|
-
resources.addToDict( 'Font', ref( fdict.getID ) ) unless hocr.nil?
|
295
|
+
resources.addToDict( 'Font', ref( fdict.getID ) ) unless hocr.nil? and reader.nil?
|
275
296
|
|
276
297
|
page = XObj.new(Hash[
|
277
298
|
'Type' => '/Page',
|
@@ -325,6 +346,18 @@ class PDFBeads::PDFBuilder
|
|
325
346
|
getOutlineObjs( toc,pages_by_num,page_objs[0].getID )
|
326
347
|
cat.addToDict('Outlines', ref(toc[0][:pdfobj].getID))
|
327
348
|
cat.addToDict('PageMode', "/UseOutlines")
|
349
|
+
vpref.addToDict('NonFullScreenPageMode', "/UseOutlines")
|
350
|
+
end
|
351
|
+
|
352
|
+
if @pdfargs[:delfiles]
|
353
|
+
pagefiles.each do |p|
|
354
|
+
$stderr.puts( "Cleaning up temporary files for #{p.name}" )
|
355
|
+
safe_delete( p.fg_layer ) if p.fg_created
|
356
|
+
safe_delete( p.bg_layer ) if p.bg_created
|
357
|
+
p.stencils.each do |s|
|
358
|
+
safe_delete( s[:path] ) if s[:created]
|
359
|
+
end
|
360
|
+
end
|
328
361
|
end
|
329
362
|
end
|
330
363
|
|
@@ -347,6 +380,15 @@ class PDFBeads::PDFBuilder
|
|
347
380
|
|
348
381
|
private
|
349
382
|
|
383
|
+
# Best-effort removal of a single file.
#
# Tries to delete +path+ and reports the outcome on $stderr. A failure to
# delete (missing file, permission problem, bad argument) is reported and
# otherwise ignored, so cleanup never aborts the run.
#
# Only StandardError is rescued: Interrupt, SystemExit and similar
# non-standard exceptions must keep propagating, so that e.g. Ctrl-C during
# cleanup is not silently turned into a "Could not delete" message.
def safe_delete( path )
  File.delete( path )
  $stderr.puts( " Deleted #{path}" )
rescue StandardError => e
  $stderr.puts( "Could not delete #{path}: #{e.message}" )
end
|
391
|
+
|
350
392
|
def parseMeta( path )
|
351
393
|
ret = Hash.new()
|
352
394
|
return ret if path.nil? or path.eql? ''
|
@@ -361,7 +403,17 @@ class PDFBeads::PDFBuilder
|
|
361
403
|
key = $1
|
362
404
|
if keys.include? key
|
363
405
|
begin
|
364
|
-
|
406
|
+
tmp_str = ''
|
407
|
+
if $2.respond_to? :encode
|
408
|
+
tmp_str = $2.encode( "utf-16be", "utf-8" )
|
409
|
+
else
|
410
|
+
tmp_str = Iconv.iconv( "utf-16be", "utf-8", $2 ).first
|
411
|
+
end
|
412
|
+
# a parenthesis code in a formally correct utf-16 should nevertheless be escaped
|
413
|
+
ret[key] = tmp_str.to_binary
|
414
|
+
ret[key].gsub!( /\x5C/,"\x5C\x5C" )
|
415
|
+
ret[key].gsub!( /\x28/,"\x5C\x28" )
|
416
|
+
ret[key].gsub!( /\x29/,"\x5C\x29" )
|
365
417
|
rescue
|
366
418
|
$stderr.puts("Error: metadata should be specified in utf-8")
|
367
419
|
end
|
@@ -372,6 +424,171 @@ class PDFBeads::PDFBuilder
|
|
372
424
|
ret
|
373
425
|
end
|
374
426
|
|
427
|
+
# Opens the PDF at +path+ with PDF::Reader.
#
# Returns nil when +path+ is nil, an empty string, or does not name a
# regular file; otherwise returns a new PDF::Reader for that path.
def getPDFReader( path )
  missing = ( path.nil? or path.eql?( '' ) )
  return nil if missing or not File.file?( path )

  PDF::Reader.new( path )
end
|
433
|
+
|
434
|
+
# Serializes a Ruby array into PDF array syntax.
#
# Each item is converted according to its Ruby class:
# * String -> literal string "(...)", with backslashes and parentheses
#             escaped so the literal cannot be terminated early;
# * Symbol -> name "/Name";
# * Array  -> nested array (recursive call);
# * anything else -> its +to_s+ representation.
#
# Items are separated by single spaces and wrapped in "[" ... "]", so an
# empty input yields "[ ]". The input array and its strings are not modified.
def encodePDFArray( in_a )
  parts = in_a.map do |item|
    if item.is_a? String
      # Escape byte-wise on a binary copy (the special bytes must be escaped
      # whatever the string's encoding is), then restore the encoding tag.
      escaped = item.dup
      enc = nil
      if escaped.respond_to? :force_encoding
        enc = escaped.encoding
        escaped.force_encoding( 'binary' )
      end
      escaped.gsub!( /[\\()]/ ) { |c| "\\" + c }
      escaped.force_encoding( enc ) unless enc.nil?
      "(#{escaped})"
    elsif item.is_a? Symbol
      "/#{item}"
    elsif item.is_a? Array
      encodePDFArray( item )
    else
      item.to_s
    end
  end

  ( [ '[' ] + parts + [ ']' ] ).join( ' ' )
end
|
451
|
+
|
452
|
+
# Copies the entry +label+ of +inhash+ into the dictionary of +outobj+,
# converting the Ruby value to PDF syntax.
#
# inhash:: hash holding the value to copy (read via inhash[label])
# outobj:: target object; receives the entry through +addToDict+
# label::  key to copy
#
# Conversion by value class:
# * String          -> literal string, with backslashes and parentheses escaped;
# * Symbol          -> name "/Name";
# * Integer, Float  -> number (floats are never written in exponent notation);
# * Array           -> array, via encodePDFArray;
# * Hash            -> a new object added to @doc and referenced from
#                      +outobj+; its entries are copied recursively;
# * PDF::Reader::Stream -> a new object carrying the stream's data, added to
#                      @doc and referenced from +outobj+; the stream's hash
#                      entries are copied recursively, except :Length.
# Values of any other class are ignored.
def encodePDFObjEntry( inhash,outobj,label )
  value = inhash[label]

  if value.is_a? String
    # Escape byte-wise on a binary copy, then restore the encoding tag.
    escaped = value.dup
    enc = nil
    if escaped.respond_to? :force_encoding
      enc = escaped.encoding
      escaped.force_encoding( 'binary' )
    end
    escaped.gsub!( /[\\()]/ ) { |c| "\\" + c }
    escaped.force_encoding( enc ) unless enc.nil?
    outobj.addToDict( label,"(#{escaped})" )

  elsif value.is_a? Symbol
    outobj.addToDict( label,"/#{value}" )

  elsif value.is_a? Integer
    outobj.addToDict( label,"#{value}" )

  elsif value.is_a? Float
    # Float#to_s may yield exponent notation (e.g. "1.0e-05"); fall back to
    # a fixed-point rendering in that case.
    num = value.to_s
    num = sprintf( "%.10f",value ).sub( /\.?0+\z/,'' ) if num.include? 'e'
    outobj.addToDict( label,num )

  elsif value.is_a? Array
    outobj.addToDict( label,encodePDFArray( value ) )

  elsif value.is_a? Hash
    newobj = XObj.new( Hash.new() )
    @doc.addObject( newobj )
    outobj.addToDict( label,ref(newobj.getID) )
    value.keys.each do |newlabel|
      encodePDFObjEntry( value,newobj,newlabel )
    end

  elsif value.is_a? PDF::Reader::Stream
    newobj = XObj.new( Hash.new(),value.data )
    @doc.addObject( newobj )
    outobj.addToDict( label,ref(newobj.getID) )
    value.hash.keys.each do |newlabel|
      encodePDFObjEntry( value.hash,newobj,newlabel ) unless newlabel.eql? :Length
    end
  end
end
|
482
|
+
|
483
|
+
# Creates a font object in @doc from the font dictionary +font+.
#
# label:: resource name of the font; when not nil it is stored as the
#         'Name' entry ("/label")
# font::  hash describing the font
#
# If +font+ has a :DescendantFonts entry, each descendant is imported
# recursively (without a label) and referenced from a "DescendantFonts"
# array. The listed font keys that are present in +font+ are then copied
# with encodePDFObjEntry. Returns the newly created object.
def importPDFFont( label,font )
  imported = XObj.new( Hash.new() )
  imported.addToDict( 'Name',"/#{label}" ) if not label.nil?
  @doc.addObject( imported )

  if font.has_key? :DescendantFonts
    children = font[:DescendantFonts].map { |child| importPDFFont( nil,child ) }
    child_refs = children.map { |child| ref(child.getID) }
    imported.addToDict( "DescendantFonts","[ #{child_refs.join(' ')} ]" )
  end

  copied_keys = [ :BaseFont, :Type, :Subtype, :FirstChar, :LastChar, :Widths,
                  :FontDescriptor, :Encoding, :ToUnicode, :DW, :W,
                  :CIDSystemInfo, :CIDToGIDMap ]
  copied_keys.each do |fontkey|
    next unless font.has_key? fontkey
    encodePDFObjEntry( font,imported,fontkey )
  end

  imported
end
|
500
|
+
|
501
|
+
# Collects the fonts used on all pages of +reader+ and imports them into @doc.
#
# reader:: object whose +pages+ each provide +fonts+ (label => font) and
#          +objects.deref+
# path::   path of the source PDF; used only in the progress message
#
# For every font label the first definition encountered wins. The fonts are
# imported with importPDFFont in order of their label (compared as strings)
# and referenced from a new font dictionary object, which is added to @doc
# and returned.
def importPDFFonts( reader,path )
  fonts = Hash.new()
  # Ask the reader for its page list once instead of once more per page.
  reader.pages.each_with_index do |page,i|
    $stderr.puts("Reading font data from #{path}: page #{i}\n")
    page.fonts.each do |label,font|
      fonts[label] = page.objects.deref( font ) unless fonts.has_key? label
    end
  end

  fdict = XObj.new( Hash[] )
  @doc.addObject( fdict )
  fonts.keys.sort_by {|sym| sym.to_s}.each do |label|
    fontobj = importPDFFont( label,fonts[label] )
    fdict.addToDict( label,ref(fontobj.getID) )
  end
  fdict
end
|
519
|
+
|
520
|
+
# Extracts the text layer of one page of an existing PDF as a content
# stream fragment.
#
# reader:: object whose +pages+ provide +raw_content+
# pidx::   zero-based index of the page
# debug::  when true the text is made visible (0 Tr), otherwise it is
#          hidden (3 Tr)
#
# The raw page content is scanned byte by byte. Every text object (BT ... ET)
# is copied, with single-digit Tr operators inside it rewritten to the
# selected rendering mode. Text state operators found outside text objects
# (Tc, Tw, Tz, TL, Ts with a numeric operand, Tf with a font name and size)
# are copied as well. Anything inside a string literal "( ... )" is skipped
# when looking for operators.
#
# Returns the collected fragments wrapped in "q <mode> Tr ... Q", or an
# empty string if the page does not exist or nothing was collected.
def getPDFText( reader,pidx,debug )
  pages = reader.pages
  return "" unless pages.length > pidx

  pcont = pages[pidx].raw_content.to_binary()
  # Operands must directly precede the operator, hence the \z anchor
  # ($ would also match at the end of any earlier line).
  num_operand  = /([-+]?\d*\.?\d+)\s+\z/
  font_operand = /\/([A-Za-z0-9]+)\s+([-+]?\d*\.?\d+)\s+\z/
  cidx = 0
  in_t = false
  pstack = 0
  prevc = "\0"
  ch_start = -1
  ret = ""
  tr_val = debug ? 0 : 3

  pcont.each_byte do |char|
    cur = char.chr

    if cur.eql? '(' or cur.eql? ')'
      # A parenthesis preceded by an odd number of backslashes is escaped
      # and does not change the nesting level.
      nslash = 0
      nslash += 1 while cidx - nslash > 0 and pcont[cidx - nslash - 1,1].eql?( "\\" )
      if nslash % 2 == 0
        pstack += ( cur.eql?( '(' ) ? 1 : -1 )
      end
    end

    unless pstack > 0
      # Text state operators may occur outside text objects. We have to take care of this
      if not in_t and prevc.eql? 'T'
        case cur
        when 'c', 'w', 'z', 'L', 's'
          if pcont[0,cidx-1] =~ num_operand
            ret << " #{$1} T#{cur}"
          end
        when 'f'
          if pcont[0,cidx-1] =~ font_operand
            ret << " /#{$1} #{$2} Tf"
          end
        # Tr operators are ignored, since we always need either a hidden text (3 Tr)
        # or (for debugging purposes) a visible text without special effects (0 Tr)
        end
      elsif not in_t and ( prevc + cur ).eql? 'BT'
        ch_start = cidx - 1
        in_t = true
      elsif in_t and ( prevc + cur ).eql? 'ET'
        chunk = pcont.slice( ch_start,cidx - ch_start + 1 )
        chunk.gsub!( /\d{1}\s+Tr/,"#{tr_val} Tr" )
        ret << "\n" << chunk
        ch_start = -1
        in_t = false
      end
    end

    prevc = cur
    cidx += 1
  end
  return "\nq #{tr_val} Tr" << ret << " Q" if ret.length > 0
  return ""
end
|
591
|
+
|
375
592
|
def getOutlineObjs( toc,page_ids,fp_id )
|
376
593
|
root = toc[0]
|
377
594
|
root[:pdfobj] = XObj.new( Hash[
|
@@ -390,8 +607,9 @@ class PDFBeads::PDFBuilder
|
|
390
607
|
end
|
391
608
|
|
392
609
|
item_text = item[:title].to_binary
|
393
|
-
item_text.
|
394
|
-
item_text.
|
610
|
+
item_text.gsub!( /\x5C/,"\x5C\x5C" )
|
611
|
+
item_text.gsub!( /\x28/,"\x5C\x28" )
|
612
|
+
item_text.gsub!( /\x29/,"\x5C\x29" )
|
395
613
|
item[:pdfobj] = XObj.new(Hash[
|
396
614
|
'Title' => "(\xFE\xFF#{item_text.to_text})",
|
397
615
|
'Parent' => ref(item[:parent][:pdfobj].getID),
|
@@ -432,8 +650,8 @@ class PDFBeads::PDFBuilder
|
|
432
650
|
def elementCoordinates( element,xscale,yscale )
|
433
651
|
out = [0,0,0,0]
|
434
652
|
|
435
|
-
if element.attributes.
|
436
|
-
if /bbox((\s+\d+){4})/.match(element.attributes
|
653
|
+
if element.attributes.has_key? 'title'
|
654
|
+
if /bbox((\s+\d+){4})/.match(element.attributes['title'].content)
|
437
655
|
coords = $1.strip.split(/\s+/)
|
438
656
|
out = [ (coords[0].to_i*xscale).to_f,(coords[1].to_i*xscale).to_f,
|
439
657
|
(coords[2].to_i*yscale).to_f,(coords[3].to_i*yscale).to_f ]
|
@@ -442,39 +660,33 @@ class PDFBeads::PDFBuilder
|
|
442
660
|
return out
|
443
661
|
end
|
444
662
|
|
445
|
-
def elementText( elem
|
446
|
-
|
447
|
-
|
448
|
-
txt = elem.to_plain_text.strip
|
449
|
-
txt = Iconv.iconv( 'utf-8',charset,txt ).first unless charset.downcase.eql? 'utf-8'
|
450
|
-
rescue
|
451
|
-
end
|
452
|
-
|
453
|
-
txt.force_encoding( 'utf-8' ) if txt.respond_to? :force_encoding
|
454
|
-
return txt
|
663
|
+
# Returns the inner text of +elem+ with leading and trailing whitespace removed.
def elementText( elem )
  # No Iconv conversion is done here any more: nokogiri performs it itself
  text = elem.inner_text
  text.strip
end
|
456
667
|
|
457
|
-
def getOCRUnits( ocr_line,lbbox,fsize,
|
668
|
+
def getOCRUnits( ocr_line,lbbox,fsize,xscale,yscale )
|
458
669
|
units = Array.new()
|
459
|
-
ocr_words = ocr_line.
|
670
|
+
ocr_words = ocr_line.xpath(".//span[@class='ocrx_word']")
|
460
671
|
ocr_chars = nil
|
461
|
-
ocr_chars = ocr_line.
|
672
|
+
ocr_chars = ocr_line.at_xpath(".//span[@class='ocr_cinfo']") if ocr_words.length == 0
|
462
673
|
|
463
674
|
# If 'ocrx_word' elements are available (as in Tesseract output), split the line
|
464
675
|
# into individual words
|
465
676
|
if ocr_words.length > 0
|
466
677
|
ocr_words.each do |word|
|
467
678
|
bbox = elementCoordinates( word,xscale,yscale )
|
468
|
-
|
679
|
+
next if bbox == [0,0,0,0]
|
680
|
+
txt = elementText( word )
|
469
681
|
units << [txt,bbox]
|
470
682
|
end
|
471
683
|
|
472
|
-
# If 'ocrx_cinfo' data is available (as in Cuneiform) owtput, then split it
|
684
|
+
# If 'ocr_cinfo' data is available (as in Cuneiform output), then split it
|
473
685
|
# into individual characters and then combine them into words
|
474
|
-
elsif not ocr_chars.nil? and ocr_chars.attributes.
|
475
|
-
if /x_bboxes([-\s\d]+)/.match( ocr_chars.attributes
|
686
|
+
elsif not ocr_chars.nil? and ocr_chars.attributes.has_key? 'title'
|
687
|
+
if /x_bboxes([-\s\d]+)/.match( ocr_chars.attributes['title'].content )
|
476
688
|
coords = $1.strip.split(/\s+/)
|
477
|
-
ltxt = elementText( ocr_line
|
689
|
+
ltxt = elementText( ocr_line )
|
478
690
|
charcnt = 0
|
479
691
|
ltxt.each_char { |uc| charcnt += 1 }
|
480
692
|
|
@@ -499,10 +711,11 @@ class PDFBeads::PDFBuilder
|
|
499
711
|
if /^\s+$/.match( uc )
|
500
712
|
wtxt = ''
|
501
713
|
|
502
|
-
# A workaround for probable hpricot bug
|
503
|
-
# characters from inside a string
|
504
|
-
# a bounding box with negative values
|
505
|
-
# character here, even if not
|
714
|
+
# A workaround for probable hpricot bug (TODO: is Nokogiri affected?),
|
715
|
+
# which sometimes causes whitespace characters from inside a string
|
716
|
+
# to be stripped. So if we find a bounding box with negative values
|
717
|
+
# we assume there was a whitespace character here, even if not
|
718
|
+
# preserved in the string itself
|
506
719
|
else
|
507
720
|
wtxt = uc
|
508
721
|
i += 1
|
@@ -519,7 +732,7 @@ class PDFBeads::PDFBuilder
|
|
519
732
|
|
520
733
|
# If neither word nor character bounding boxes are available, then store the line as a whole
|
521
734
|
if units.length == 0
|
522
|
-
ltxt = elementText( ocr_line
|
735
|
+
ltxt = elementText( ocr_line )
|
523
736
|
units << [ltxt,lbbox] unless ltxt.eql? ''
|
524
737
|
end
|
525
738
|
|
@@ -527,22 +740,15 @@ class PDFBeads::PDFBuilder
|
|
527
740
|
return units
|
528
741
|
end
|
529
742
|
|
530
|
-
def
|
743
|
+
def getHOCRText( hocr,pheight,xscale,yscale,encodings )
|
531
744
|
fsize = 10
|
532
745
|
cur_enc = nil
|
533
746
|
ret = " BT 3 Tr "
|
534
747
|
|
535
|
-
|
536
|
-
hocr.search("//meta[@http-equiv='Content-Type']").each do |el|
|
537
|
-
attrs = el.attributes.to_hash
|
538
|
-
charset = $1 if attrs.has_key? 'content' and
|
539
|
-
/\Atext\/html;charset=([A-Za-z0-9-]+)\Z/i.match( attrs['content'] )
|
540
|
-
end
|
541
|
-
|
542
|
-
hocr.search("//span[@class='ocr_line']").each do |line|
|
748
|
+
hocr.xpath("//span[@class='ocr_line']").each do |line|
|
543
749
|
lbbox = elementCoordinates( line,xscale,yscale )
|
544
750
|
next if lbbox[2] - lbbox[0] <= 0 or lbbox[3] - lbbox[1] <= 0
|
545
|
-
units = getOCRUnits( line,lbbox,fsize,
|
751
|
+
units = getOCRUnits( line,lbbox,fsize,xscale,yscale )
|
546
752
|
next if units.length == 0
|
547
753
|
|
548
754
|
wwidth = 0
|
@@ -551,7 +757,9 @@ class PDFBeads::PDFBuilder
|
|
551
757
|
ltxt << unit[0]
|
552
758
|
wwidth += ( unit[1][2] - unit[1][0] )
|
553
759
|
end
|
554
|
-
|
760
|
+
lw = @fdata.getLineWidth( ltxt,fsize )
|
761
|
+
ratio = 1
|
762
|
+
ratio = wwidth / lw unless lw == 0
|
555
763
|
pos = lbbox[0]
|
556
764
|
posdiff = 0
|
557
765
|
|
@@ -570,7 +778,11 @@ class PDFBeads::PDFBuilder
|
|
570
778
|
txt8 = ''
|
571
779
|
wtxt.each_char do |char|
|
572
780
|
begin
|
573
|
-
|
781
|
+
if char.respond_to? :encode
|
782
|
+
char.encode!( "utf-16be", "utf-8" )
|
783
|
+
else
|
784
|
+
Iconv.iconv( "utf-16be","utf-8",char )
|
785
|
+
end
|
574
786
|
rescue
|
575
787
|
rawbytes = char.unpack( 'C*' )
|
576
788
|
bs = ''
|
@@ -691,15 +903,18 @@ class PDFBeads::PDFBuilder
|
|
691
903
|
|
692
904
|
unless stencil.compression.eql? :CCITTFaxDecode and rows_per_strip >= height
|
693
905
|
img = ImageList.new( path )
|
694
|
-
imgdata = img.to_blob{
|
695
|
-
|
696
|
-
|
697
|
-
|
906
|
+
imgdata = img.to_blob { |imd|
|
907
|
+
imd.format = 'TIFF'
|
908
|
+
imd.define( 'TIFF','rows-per-strip',height )
|
909
|
+
imd.compression = Group4Compression
|
698
910
|
}
|
699
911
|
stencil = ImageInspector.new( StringIO.new(imgdata) )
|
700
912
|
img.destroy!
|
701
913
|
end
|
702
914
|
body = stencil.getRawData
|
915
|
+
photometric = 0
|
916
|
+
photometric = stencil.tags[0x106][0] if
|
917
|
+
stencil.format.eql? :TIFF and stencil.tags.has_key? 0x106
|
703
918
|
|
704
919
|
xobj = XObj.new(Hash[
|
705
920
|
'Type' => '/XObject',
|
@@ -711,8 +926,14 @@ class PDFBeads::PDFBuilder
|
|
711
926
|
'ColorSpace' => '/DeviceGray',
|
712
927
|
'BitsPerComponent' => '1',
|
713
928
|
'Filter' => '/CCITTFaxDecode',
|
714
|
-
'DecodeParms' => "<< /Columns #{width} /K -1 >>"
|
929
|
+
'DecodeParms' => "<< /Columns #{width} /K -1 >>",
|
715
930
|
], body)
|
931
|
+
if photometric == 1 then
|
932
|
+
# As ImageMask is always on, BlackIs1 actually doesn't work, while
|
933
|
+
# the Decode array does.
|
934
|
+
xobj.addToDict( 'BlackIs1', 'true' )
|
935
|
+
xobj.addToDict( 'Decode', '[1 0]' )
|
936
|
+
end
|
716
937
|
|
717
938
|
return [ xobj,width,height,xres,yres ]
|
718
939
|
end
|
@@ -764,10 +985,10 @@ class PDFBeads::PDFBuilder
|
|
764
985
|
( [ :FlateDecode,:LZWDecode,:CCITTFaxDecode ].include? insp.compression and insp.tags[0x0116][0] >= insp.height )))
|
765
986
|
|
766
987
|
img = ImageList.new( impath )
|
767
|
-
imgdata = img.to_blob{
|
768
|
-
|
769
|
-
|
770
|
-
|
988
|
+
imgdata = img.to_blob { |imd|
|
989
|
+
imd.format = 'PNG'
|
990
|
+
imd.quality = 95
|
991
|
+
imd.compression = ZipCompression
|
771
992
|
}
|
772
993
|
insp = ImageInspector.new( StringIO.new(imgdata) )
|
773
994
|
img.destroy!
|
data/lib/pdfbeads/pdfdoc.rb
CHANGED
File without changes
|
data/lib/pdfbeads/pdffont.rb
CHANGED
File without changes
|
data/lib/pdfbeads/pdflabels.rb
CHANGED
File without changes
|