pdfbeads 1.0.7 → 1.1.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -9,7 +9,7 @@
9
9
  # Unlike other PDF creation tools, this utility attempts to implement
10
10
  # the approach typically used for DjVu books. Its key feature is
11
11
  # separating scanned text (typically black, but indexed images with
12
- # a small number of colors are also accepted) from halftone images
12
+ # a small number of colors are also accepted) from halftone images
13
13
  # placed into a background layer.
14
14
  #
15
15
  # Copyright (C) 2010 Alexey Kryukov (amkryukov@gmail.com).
@@ -69,6 +69,7 @@ class PDFBeads::PDFBuilder
69
69
  labels = PDFLabels.new( @pdfargs[:labels] ) unless @pdfargs[:labels].nil?
70
70
  toc = PDFTOC.new( @pdfargs[:toc] ) unless @pdfargs[:toc].nil?
71
71
  meta = parseMeta( @pdfargs[:meta] )
72
+ reader = getPDFReader( @pdfargs[:textpdf] )
72
73
 
73
74
  cat = XObj.new(Hash[
74
75
  'Type' => '/Catalog',
@@ -98,12 +99,12 @@ class PDFBeads::PDFBuilder
98
99
  info.addToDict(key, "(\xFE\xFF#{meta[key].to_text})")
99
100
  end
100
101
 
101
- out = XObj.new(Hash[
102
- 'Type' => '/Outlines',
103
- 'Count' => 0
104
- ])
105
- @doc.addObject(out)
106
- cat.addToDict('Outlines', ref(out.getID))
102
+ if ( toc != nil and toc.length > 0 ) or @pdfargs[:rtl]
103
+ vpref = XObj.new(Hash.new())
104
+ vpref.addToDict('Direction', "/R2L") if @pdfargs[:rtl]
105
+ @doc.addObject(vpref)
106
+ cat.addToDict('ViewerPreferences', ref(vpref.getID))
107
+ end
107
108
 
108
109
  pages = XObj.new(Hash[
109
110
  'Type' => '/Pages'
@@ -132,8 +133,8 @@ class PDFBeads::PDFBuilder
132
133
  'Intent' => '[/View/Design]'
133
134
  })
134
135
  @doc.addObject(ocBack)
135
- cat.addToDict('OCProperties',
136
- sprintf("<< /OCGs[%s %s] /D<< /Intent /View /BaseState (ON) /Order[%s %s] >>>>",
136
+ cat.addToDict('OCProperties',
137
+ sprintf("<< /OCGs[%s %s] /D<< /Intent /View /BaseState /ON /Order[%s %s] >>>>",
137
138
  ref(ocFore.getID), ref(ocBack.getID), ref(ocFore.getID), ref(ocBack.getID)))
138
139
 
139
140
  page_objs = Array.new()
@@ -150,10 +151,19 @@ class PDFBeads::PDFBuilder
150
151
  begin
151
152
  # If possible, use iso8859-1 (aka PDFDocEncoding) for page labels:
152
153
  # it is at least guaranteed to be safe
153
- ltitl = Iconv.iconv( "iso8859-1", "utf-8", rng[:prefix] ).first
154
- nTree << "/P (#{ltitl.to_text}) "
155
- rescue Iconv::InvalidCharacter, Iconv::IllegalSequence
156
- ltitl = Iconv.iconv( "utf-16be", "utf-8", rng[:prefix] ).first
154
+ if rng[:prefix].respond_to? :encode
155
+ ltitl = rng[:prefix].encode( "iso8859-1", "utf-8" )
156
+ else
157
+ ltitl = Iconv.iconv( "iso8859-1", "utf-8", rng[:prefix] ).first
158
+ end
159
+ nTree << "/P (#{ltitl.to_text}) "
160
+ # Iconv::InvalidCharacter, Iconv::IllegalSequence, Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
161
+ rescue
162
+ if rng[:prefix].respond_to? :encode
163
+ ltitl = rng[:prefix].encode( "utf-16be", "utf-8" )
164
+ else
165
+ ltitl = Iconv.iconv( "utf-16be", "utf-8", rng[:prefix] ).first
166
+ end
157
167
  # If there is no number (just prefix) then put a zero character after the prefix:
158
168
  # this makes acroread happy, but prevents displaying the number in evince
159
169
  unless rng.has_key? :style
@@ -176,27 +186,31 @@ class PDFBeads::PDFBuilder
176
186
 
177
187
  needs_font = false
178
188
  fonts = encodings = nil
179
- pagefiles.each do |p|
180
- unless p.hocr_path.nil?
181
- needs_font = true
182
- break
189
+ unless reader.nil?
190
+ fdict = importPDFFonts( reader,@pdfargs[:textpdf] )
191
+ else
192
+ pagefiles.each do |p|
193
+ unless p.hocr_path.nil?
194
+ needs_font = true
195
+ break
196
+ end
183
197
  end
184
- end
185
198
 
186
- if needs_font
187
- fonts = Array.new()
188
- encodings = [ [' '] ]
189
- fdict = XObj.new( Hash[] )
190
- @doc.addObject( fdict )
191
-
192
- descr = XObj.new( Hash[
193
- 'Type' => '/FontDescriptor',
194
- 'BaseFont' => '/Times-Roman',
195
- ] )
196
- @fdata.header.each_key do |key|
197
- descr.addToDict( key,@fdata.header[key] )
199
+ if needs_font
200
+ fonts = Array.new()
201
+ encodings = [ [' '] ]
202
+ fdict = XObj.new( Hash[] )
203
+ @doc.addObject( fdict )
204
+
205
+ descr = XObj.new( Hash[
206
+ 'Type' => '/FontDescriptor',
207
+ 'BaseFont' => '/Times-Roman',
208
+ ] )
209
+ @fdata.header.each_key do |key|
210
+ descr.addToDict( key,@fdata.header[key] )
211
+ end
212
+ @doc.addObject( descr )
198
213
  end
199
- @doc.addObject( descr )
200
214
  end
201
215
 
202
216
  pagefiles.each do |p|
@@ -261,17 +275,24 @@ class PDFBeads::PDFBuilder
261
275
  doc_objs.concat( [contents, resobj, resources] )
262
276
 
263
277
  hocr = nil
264
- unless p.hocr_path.nil?
265
- hocr = open( p.hocr_path ) { |f| Hpricot.parse( f ) }
278
+ if not reader.nil?
266
279
  procSet << '/Text'
267
- c_str << getPDFText( hocr,pheight,72.0/xres,72.0/yres,encodings )
280
+ c_str << getPDFText( reader,pidx,@pdfargs[:debug] )
281
+ elsif not p.hocr_path.nil?
282
+ hocr = open( p.hocr_path ) { |f| Nokogiri::HTML( f ) }
283
+ procSet << '/Text'
284
+ c_str << getHOCRText( hocr,pheight,72.0/xres,72.0/yres,encodings )
268
285
  end
269
286
 
270
- contents.reinit( Hash[
271
- 'Filter' => '/FlateDecode'
272
- ], Zlib::Deflate.deflate( c_str,9 ) )
287
+ unless @pdfargs[:debug]
288
+ contents.reinit( Hash[
289
+ 'Filter' => '/FlateDecode'
290
+ ], Zlib::Deflate.deflate( c_str,9 ) )
291
+ else
292
+ contents.reinit( Hash[], c_str )
293
+ end
273
294
  resources.addToDict( 'ProcSet', "[ #{procSet.join(' ')} ]" )
274
- resources.addToDict( 'Font', ref( fdict.getID ) ) unless hocr.nil?
295
+ resources.addToDict( 'Font', ref( fdict.getID ) ) unless hocr.nil? and reader.nil?
275
296
 
276
297
  page = XObj.new(Hash[
277
298
  'Type' => '/Page',
@@ -325,6 +346,18 @@ class PDFBeads::PDFBuilder
325
346
  getOutlineObjs( toc,pages_by_num,page_objs[0].getID )
326
347
  cat.addToDict('Outlines', ref(toc[0][:pdfobj].getID))
327
348
  cat.addToDict('PageMode', "/UseOutlines")
349
+ vpref.addToDict('NonFullScreenPageMode', "/UseOutlines")
350
+ end
351
+
352
+ if @pdfargs[:delfiles]
353
+ pagefiles.each do |p|
354
+ $stderr.puts( "Cleaning up temporary files for #{p.name}" )
355
+ safe_delete( p.fg_layer ) if p.fg_created
356
+ safe_delete( p.bg_layer ) if p.bg_created
357
+ p.stencils.each do |s|
358
+ safe_delete( s[:path] ) if s[:created]
359
+ end
360
+ end
328
361
  end
329
362
  end
330
363
 
@@ -347,6 +380,15 @@ class PDFBeads::PDFBuilder
347
380
 
348
381
  private
349
382
 
383
+ def safe_delete( path )
384
+ begin
385
+ File.delete( path )
386
+ $stderr.puts( " Deleted #{path}" )
387
+ rescue Exception => e
388
+ $stderr.puts( "Could not delete #{path}: #{e.message}" )
389
+ end
390
+ end
391
+
350
392
  def parseMeta( path )
351
393
  ret = Hash.new()
352
394
  return ret if path.nil? or path.eql? ''
@@ -361,7 +403,17 @@ class PDFBeads::PDFBuilder
361
403
  key = $1
362
404
  if keys.include? key
363
405
  begin
364
- ret[key] = Iconv.iconv( "utf-16be", "utf-8", $2 ).first
406
+ tmp_str = ''
407
+ if $2.respond_to? :encode
408
+ tmp_str = $2.encode( "utf-16be", "utf-8" )
409
+ else
410
+ tmp_str = Iconv.iconv( "utf-16be", "utf-8", $2 ).first
411
+ end
412
+ # a parenthesis code in a formally correct utf-16 should nevertheless be escaped
413
+ ret[key] = tmp_str.to_binary
414
+ ret[key].gsub!( /\x5C/,"\x5C\x5C" )
415
+ ret[key].gsub!( /\x28/,"\x5C\x28" )
416
+ ret[key].gsub!( /\x29/,"\x5C\x29" )
365
417
  rescue
366
418
  $stderr.puts("Error: metadata should be specified in utf-8")
367
419
  end
@@ -372,6 +424,171 @@ class PDFBeads::PDFBuilder
372
424
  ret
373
425
  end
374
426
 
427
+ def getPDFReader( path )
428
+ return nil if path.nil? or path.eql? ''
429
+ return nil unless File.file? path
430
+
431
+ PDF::Reader.new( path )
432
+ end
433
+
434
+ def encodePDFArray( in_a )
435
+ out_a = Array.new()
436
+ out_a << '['
437
+ in_a.each do |item|
438
+ if item.is_a? String
439
+ out_a << ( '(' << item.to_s << ')' )
440
+ elsif item.is_a? Symbol
441
+ out_a << ( '/' << item.to_s )
442
+ elsif item.is_a? Array
443
+ out_a << encodePDFArray( item )
444
+ else
445
+ out_a << item.to_s
446
+ end
447
+ end
448
+ out_a << ']'
449
+ out_a.join( ' ' )
450
+ end
451
+
452
+ def encodePDFObjEntry( inhash,outobj,label )
453
+ if inhash[label].is_a? String
454
+ outobj.addToDict( label,"(#{inhash[label]})" )
455
+
456
+ elsif inhash[label].is_a? Symbol
457
+ outobj.addToDict( label,"/#{inhash[label]}" )
458
+
459
+ elsif inhash[label].is_a? Integer
460
+ outobj.addToDict( label,"#{inhash[label]}" )
461
+
462
+ elsif inhash[label].is_a? Array
463
+ outobj.addToDict( label,encodePDFArray( inhash[label] ) )
464
+
465
+ elsif inhash[label].is_a? Hash
466
+ newobj = XObj.new( Hash.new() )
467
+ @doc.addObject( newobj )
468
+ outobj.addToDict( label,ref(newobj.getID) )
469
+ inhash[label].keys.each do |newlabel|
470
+ encodePDFObjEntry( inhash[label],newobj,newlabel )
471
+ end
472
+
473
+ elsif inhash[label].is_a? PDF::Reader::Stream
474
+ newobj = XObj.new( Hash.new(),inhash[label].data )
475
+ @doc.addObject( newobj )
476
+ outobj.addToDict( label,ref(newobj.getID) )
477
+ inhash[label].hash.keys.each do |newlabel|
478
+ encodePDFObjEntry( inhash[label].hash,newobj,newlabel ) unless newlabel.eql? :Length
479
+ end
480
+ end
481
+ end
482
+
483
+ def importPDFFont( label,font )
484
+ fontobj = XObj.new( Hash.new() )
485
+ fontobj.addToDict( 'Name',"/#{label}" ) unless label.nil?
486
+ @doc.addObject( fontobj )
487
+
488
+ if font.has_key? :DescendantFonts
489
+ dfonts = Array.new()
490
+ font[:DescendantFonts].each {|dfont| dfonts << importPDFFont( nil,dfont ) }
491
+ fontobj.addToDict( "DescendantFonts",'[ ' << dfonts.map{|dfont| ref(dfont.getID)}.join(' ') << ' ]' )
492
+ end
493
+
494
+ [ :BaseFont, :Type, :Subtype, :FirstChar, :LastChar, :Widths, :FontDescriptor,
495
+ :Encoding, :ToUnicode, :DW, :W, :CIDSystemInfo, :CIDToGIDMap ].each do |fontkey|
496
+ encodePDFObjEntry( font,fontobj,fontkey ) if font.has_key? fontkey
497
+ end
498
+ fontobj
499
+ end
500
+
501
+ def importPDFFonts( reader,path )
502
+ fonts = Hash.new()
503
+ reader.pages.each_index do |i|
504
+ $stderr.puts("Reading font data from #{path}: page #{i}\n")
505
+ page = reader.pages[i]
506
+ page.fonts.each do |label,font|
507
+ fonts[label] = page.objects.deref( font ) unless fonts.has_key? label
508
+ end
509
+ end
510
+
511
+ fdict = XObj.new( Hash[] )
512
+ @doc.addObject( fdict )
513
+ fonts.keys.sort_by {|sym| sym.to_s}.each do |label|
514
+ fontobj = importPDFFont( label,fonts[label] )
515
+ fdict.addToDict( label,ref(fontobj.getID) )
516
+ end
517
+ fdict
518
+ end
519
+
520
+ def getPDFText( reader,pidx,debug )
521
+ return "" unless reader.pages.length > pidx
522
+
523
+ page = reader.pages[pidx]
524
+ pcont = page.raw_content.to_binary()
525
+ cidx = 0
526
+ in_t = false
527
+ pstack = 0
528
+ prevc = "\0"
529
+ ch_start = -1
530
+ ret = ""
531
+ tr_val = debug ? 0 : 3
532
+
533
+ pcont.each_byte do |char|
534
+ if char.chr.eql? '('
535
+ ctx = pcont[0,cidx].match( /\\+$/ )
536
+ pstack += 1 if ( ctx.nil? or ctx[0].length % 2 == 0 )
537
+ elsif char.chr.eql? ')'
538
+ ctx = pcont[0,cidx].match( /\\+$/ )
539
+ pstack -= 1 if ( ctx.nil? or ctx[0].length % 2 == 0 )
540
+ end
541
+
542
+ unless pstack > 0
543
+ # Text state operators may occur outside text objects. We have to take care of this
544
+ if not in_t and prevc.eql? 'T'
545
+ case char.chr
546
+ when 'c'
547
+ if pcont[0,cidx-1] =~ /([-+]?\d*\.?\d+)\s+$/
548
+ ret << " #{$1} Tc"
549
+ end
550
+ when 'w'
551
+ if pcont[0,cidx-1] =~ /([-+]?\d*\.?\d+)\s+$/
552
+ ret << " #{$1} Tw"
553
+ end
554
+ when 'z'
555
+ if pcont[0,cidx-1] =~ /([-+]?\d*\.?\d+)\s+$/
556
+ ret << " #{$1} Tz"
557
+ end
558
+ when 'L'
559
+ if pcont[0,cidx-1] =~ /([-+]?\d*\.?\d+)\s+$/
560
+ ret << " #{$1} TL"
561
+ end
562
+ when 'f'
563
+ if pcont[0,cidx-1] =~ /\/([A-Za-z0-9]+)\s+([-+]?\d*\.?\d+)\s+$/
564
+ ret << " /#{$1} #{$2} Tf"
565
+ end
566
+ # Tr operators are ignored, since we always need either a hidden text (3 Tr)
567
+ # or (for debugging purposes) a visible text without special effects (0 Tr)
568
+ when 's'
569
+ if pcont[0,cidx-1] =~ /([-+]?\d*\.?\d+)\s+$/
570
+ chunks << " #{$1} Ts"
571
+ end
572
+ end
573
+ elsif not in_t and ( prevc + char.chr ).eql? 'BT'
574
+ ch_start = cidx -1
575
+ in_t = true
576
+ elsif in_t and ( prevc + char.chr ).eql? 'ET'
577
+ chunk = pcont.slice( ch_start,cidx - ch_start + 1 )
578
+ chunk.gsub!( /\d{1}\s+Tr/,"#{tr_val} Tr" )
579
+ ret << "\n" << chunk
580
+ ch_start = -1
581
+ in_t = false
582
+ end
583
+ end
584
+
585
+ prevc = char.chr
586
+ cidx += 1
587
+ end
588
+ return "\nq #{tr_val} Tr" << ret << " Q" if ret.length > 0
589
+ return ""
590
+ end
591
+
375
592
  def getOutlineObjs( toc,page_ids,fp_id )
376
593
  root = toc[0]
377
594
  root[:pdfobj] = XObj.new( Hash[
@@ -390,8 +607,9 @@ class PDFBeads::PDFBuilder
390
607
  end
391
608
 
392
609
  item_text = item[:title].to_binary
393
- item_text.sub!( /\x28/,"\x5C\x28" )
394
- item_text.sub!( /\x29/,"\x5C\x29" )
610
+ item_text.gsub!( /\x5C/,"\x5C\x5C" )
611
+ item_text.gsub!( /\x28/,"\x5C\x28" )
612
+ item_text.gsub!( /\x29/,"\x5C\x29" )
395
613
  item[:pdfobj] = XObj.new(Hash[
396
614
  'Title' => "(\xFE\xFF#{item_text.to_text})",
397
615
  'Parent' => ref(item[:parent][:pdfobj].getID),
@@ -432,8 +650,8 @@ class PDFBeads::PDFBuilder
432
650
  def elementCoordinates( element,xscale,yscale )
433
651
  out = [0,0,0,0]
434
652
 
435
- if element.attributes.to_hash.has_key? 'title'
436
- if /bbox((\s+\d+){4})/.match(element.attributes.to_hash['title'])
653
+ if element.attributes.has_key? 'title'
654
+ if /bbox((\s+\d+){4})/.match(element.attributes['title'].content)
437
655
  coords = $1.strip.split(/\s+/)
438
656
  out = [ (coords[0].to_i*xscale).to_f,(coords[1].to_i*xscale).to_f,
439
657
  (coords[2].to_i*yscale).to_f,(coords[3].to_i*yscale).to_f ]
@@ -442,39 +660,33 @@ class PDFBeads::PDFBuilder
442
660
  return out
443
661
  end
444
662
 
445
- def elementText( elem,charset )
446
- txt = ''
447
- begin
448
- txt = elem.to_plain_text.strip
449
- txt = Iconv.iconv( 'utf-8',charset,txt ).first unless charset.downcase.eql? 'utf-8'
450
- rescue
451
- end
452
-
453
- txt.force_encoding( 'utf-8' ) if txt.respond_to? :force_encoding
454
- return txt
663
+ def elementText( elem )
664
+ # used to put some Iconv stuff here, but nokogiri performs this conversion itself
665
+ return elem.inner_text.strip
455
666
  end
456
667
 
457
- def getOCRUnits( ocr_line,lbbox,fsize,charset,xscale,yscale )
668
+ def getOCRUnits( ocr_line,lbbox,fsize,xscale,yscale )
458
669
  units = Array.new()
459
- ocr_words = ocr_line.search("//span[@class='ocrx_word']")
670
+ ocr_words = ocr_line.xpath(".//span[@class='ocrx_word']")
460
671
  ocr_chars = nil
461
- ocr_chars = ocr_line.at("//span[@class='ocr_cinfo']") if ocr_words.length == 0
672
+ ocr_chars = ocr_line.at_xpath(".//span[@class='ocr_cinfo']") if ocr_words.length == 0
462
673
 
463
674
  # If 'ocrx_word' elements are available (as in Tesseract owtput), split the line
464
675
  # into individual words
465
676
  if ocr_words.length > 0
466
677
  ocr_words.each do |word|
467
678
  bbox = elementCoordinates( word,xscale,yscale )
468
- txt = elementText( word,charset )
679
+ next if bbox == [0,0,0,0]
680
+ txt = elementText( word )
469
681
  units << [txt,bbox]
470
682
  end
471
683
 
472
- # If 'ocrx_cinfo' data is available (as in Cuneiform) owtput, then split it
684
+ # If 'ocrx_cinfo' data is available (as in Cuneiform) owtput, then split it
473
685
  # into individual characters and then combine them into words
474
- elsif not ocr_chars.nil? and ocr_chars.attributes.to_hash.has_key? 'title'
475
- if /x_bboxes([-\s\d]+)/.match( ocr_chars.attributes.to_hash['title'] )
686
+ elsif not ocr_chars.nil? and ocr_chars.attributes.has_key? 'title'
687
+ if /x_bboxes([-\s\d]+)/.match( ocr_chars.attributes['title'].content )
476
688
  coords = $1.strip.split(/\s+/)
477
- ltxt = elementText( ocr_line,charset )
689
+ ltxt = elementText( ocr_line )
478
690
  charcnt = 0
479
691
  ltxt.each_char { |uc| charcnt += 1 }
480
692
 
@@ -499,10 +711,11 @@ class PDFBeads::PDFBuilder
499
711
  if /^\s+$/.match( uc )
500
712
  wtxt = ''
501
713
 
502
- # A workaround for probable hpricot bug, which sometimes causes whitespace
503
- # characters from inside a string to be stripped. So if we find
504
- # a bounding box with negative values we assume there was a whitespace
505
- # character here, even if not preserved in the string itself
714
+ # A workaround for probable hpricot bug (TODO: is Nokogiri affected?),
715
+ # which sometimes causes whitespace characters from inside a string
716
+ # to be stripped. So if we find a bounding box with negative values
717
+ # we assume there was a whitespace character here, even if not
718
+ # preserved in the string itself
506
719
  else
507
720
  wtxt = uc
508
721
  i += 1
@@ -519,7 +732,7 @@ class PDFBeads::PDFBuilder
519
732
 
520
733
  # If neither word nor character bounding boxes are available, then store the line as a whole
521
734
  if units.length == 0
522
- ltxt = elementText( ocr_line,charset )
735
+ ltxt = elementText( ocr_line )
523
736
  units << [ltxt,lbbox] unless ltxt.eql? ''
524
737
  end
525
738
 
@@ -527,22 +740,15 @@ class PDFBeads::PDFBuilder
527
740
  return units
528
741
  end
529
742
 
530
- def getPDFText( hocr,pheight,xscale,yscale,encodings )
743
+ def getHOCRText( hocr,pheight,xscale,yscale,encodings )
531
744
  fsize = 10
532
745
  cur_enc = nil
533
746
  ret = " BT 3 Tr "
534
747
 
535
- charset = 'utf-8'
536
- hocr.search("//meta[@http-equiv='Content-Type']").each do |el|
537
- attrs = el.attributes.to_hash
538
- charset = $1 if attrs.has_key? 'content' and
539
- /\Atext\/html;charset=([A-Za-z0-9-]+)\Z/i.match( attrs['content'] )
540
- end
541
-
542
- hocr.search("//span[@class='ocr_line']").each do |line|
748
+ hocr.xpath("//span[@class='ocr_line']").each do |line|
543
749
  lbbox = elementCoordinates( line,xscale,yscale )
544
750
  next if lbbox[2] - lbbox[0] <= 0 or lbbox[3] - lbbox[1] <= 0
545
- units = getOCRUnits( line,lbbox,fsize,charset,xscale,yscale )
751
+ units = getOCRUnits( line,lbbox,fsize,xscale,yscale )
546
752
  next if units.length == 0
547
753
 
548
754
  wwidth = 0
@@ -551,7 +757,9 @@ class PDFBeads::PDFBuilder
551
757
  ltxt << unit[0]
552
758
  wwidth += ( unit[1][2] - unit[1][0] )
553
759
  end
554
- ratio = wwidth / @fdata.getLineWidth( ltxt,fsize )
760
+ lw = @fdata.getLineWidth( ltxt,fsize )
761
+ ratio = 1
762
+ ratio = wwidth / lw unless lw == 0
555
763
  pos = lbbox[0]
556
764
  posdiff = 0
557
765
 
@@ -570,7 +778,11 @@ class PDFBeads::PDFBuilder
570
778
  txt8 = ''
571
779
  wtxt.each_char do |char|
572
780
  begin
573
- Iconv.iconv( "utf-16be","utf-8",char )
781
+ if char.respond_to? :encode
782
+ char.encode!( "utf-16be", "utf-8" )
783
+ else
784
+ Iconv.iconv( "utf-16be","utf-8",char )
785
+ end
574
786
  rescue
575
787
  rawbytes = char.unpack( 'C*' )
576
788
  bs = ''
@@ -691,15 +903,18 @@ class PDFBeads::PDFBuilder
691
903
 
692
904
  unless stencil.compression.eql? :CCITTFaxDecode and rows_per_strip >= height
693
905
  img = ImageList.new( path )
694
- imgdata = img.to_blob{
695
- self.format = 'TIFF'
696
- self.define( 'TIFF','rows-per-strip',height )
697
- self.compression = Group4Compression
906
+ imgdata = img.to_blob { |imd|
907
+ imd.format = 'TIFF'
908
+ imd.define( 'TIFF','rows-per-strip',height )
909
+ imd.compression = Group4Compression
698
910
  }
699
911
  stencil = ImageInspector.new( StringIO.new(imgdata) )
700
912
  img.destroy!
701
913
  end
702
914
  body = stencil.getRawData
915
+ photometric = 0
916
+ photometric = stencil.tags[0x106][0] if
917
+ stencil.format.eql? :TIFF and stencil.tags.has_key? 0x106
703
918
 
704
919
  xobj = XObj.new(Hash[
705
920
  'Type' => '/XObject',
@@ -711,8 +926,14 @@ class PDFBeads::PDFBuilder
711
926
  'ColorSpace' => '/DeviceGray',
712
927
  'BitsPerComponent' => '1',
713
928
  'Filter' => '/CCITTFaxDecode',
714
- 'DecodeParms' => "<< /Columns #{width} /K -1 >>"
929
+ 'DecodeParms' => "<< /Columns #{width} /K -1 >>",
715
930
  ], body)
931
+ if photometric == 1 then
932
+ # As ImageMask is always on, BlackIs1 actually doesn't work, while
933
+ # the Decode array does.
934
+ xobj.addToDict( 'BlackIs1', 'true' )
935
+ xobj.addToDict( 'Decode', '[1 0]' )
936
+ end
716
937
 
717
938
  return [ xobj,width,height,xres,yres ]
718
939
  end
@@ -764,10 +985,10 @@ class PDFBeads::PDFBuilder
764
985
  ( [ :FlateDecode,:LZWDecode,:CCITTFaxDecode ].include? insp.compression and insp.tags[0x0116][0] >= insp.height )))
765
986
 
766
987
  img = ImageList.new( impath )
767
- imgdata = img.to_blob{
768
- self.format = 'PNG'
769
- self.quality = 95
770
- self.compression = ZipCompression
988
+ imgdata = img.to_blob { |imd|
989
+ imd.format = 'PNG'
990
+ imd.quality = 95
991
+ imd.compression = ZipCompression
771
992
  }
772
993
  insp = ImageInspector.new( StringIO.new(imgdata) )
773
994
  img.destroy!
File without changes
File without changes
File without changes