pdfbeads 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/pdfbeads.rb ADDED
@@ -0,0 +1,93 @@
1
+ # encoding: UTF-8
2
+
3
+ ######################################################################
4
+ #
5
+ # PDFBeads -- convert scanned images to a single PDF file
6
+ # Version 1.0
7
+ #
8
+ # Unlike other PDF creation tools, this utility attempts to implement
9
+ # the approach typically used for DjVu books. Its key feature is
10
+ # separating scanned text (typically black, but indexed images with
11
+ # a small number of colors are also accepted) from halftone images
12
+ # placed into a background layer.
13
+ #
14
+ # Copyright (C) 2010 Alexey Kryukov (amkryukov@gmail.com).
15
+ # All rights reserved.
16
+ #
17
+ # This program is free software; you can redistribute it and/or modify
18
+ # it under the terms of the GNU General Public License as published by
19
+ # the Free Software Foundation; either version 2 of the License, or
20
+ # (at your option) any later version.
21
+ #
22
+ # This program is distributed in the hope that it will be useful,
23
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
24
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25
+ # GNU General Public License for more details.
26
+ #
27
+ # You should have received a copy of the GNU General Public License
28
+ # along with this program; if not, write to the Free Software
29
+ # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
30
+ #
31
+ #######################################################################
32
+
33
+ require 'iconv'
34
+ require 'zlib'
35
+
36
+ require 'RMagick'
37
+ include Magick
38
+
39
# hpricot is optional. $has_hpricot records whether it could be loaded, so
# that the rest of the program can tell if hOCR files are readable.
$has_hpricot =
  begin
    require 'hpricot'
    true
  rescue LoadError
    $stderr.puts( "Warning: the hpricot extension is not available." )
    $stderr.puts( " pdfbeads will not be able to read OCR data from hOCR files." )
    false
  end

# Interpreters whose strings lack #ord (Ruby < 1.9) need $KCODE and jcode
# for UTF-8 aware string handling.
if !''.respond_to?( :ord )
  $KCODE = 'u'
  require 'jcode'
end
52
+
53
# Encoding helpers added to String. Each method is only defined when the
# class does not already provide one with the same name.
class String
  # Mark a string as a raw sequence of bytes. This matters for Ruby 1.9;
  # where force_encoding is not available the method does nothing.
  # Returns the receiver itself.
  unless method_defined?( :to_binary )
    def to_binary
      force_encoding( 'ASCII-8BIT' ) if respond_to?( :force_encoding )
      self
    end
  end

  # In Ruby 1.9 a string sometimes has to be tagged as UTF-8 even when we
  # know for certain that it is not. Returns the receiver itself.
  unless method_defined?( :to_text )
    def to_text
      force_encoding( 'UTF-8' ) if respond_to?( :force_encoding )
      self
    end
  end

  # Unicode ordinal of an encoded character (Ruby < 1.9 has no standard
  # method for that). Any conversion error yields a question mark.
  unless method_defined?( :ord )
    def ord
      utf16 = Iconv.iconv( 'utf-16be','utf-8',self ).first
      utf16.unpack('n')[0]
    rescue
      0x3F # Question mark
    end
  end
end
85
+
86
+ require 'imageinspector'
87
+
88
# Namespace of the library. The builder and page classes are loaded from
# inside the module body, after VERSION has been defined.
module PDFBeads
  VERSION = '1.0'

  %w[pdfbeads/pdfbuilder pdfbeads/pdfpage].each { |lib| require lib }
end
93
+
@@ -0,0 +1,699 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: UTF-8
3
+
4
+ ######################################################################
5
+ #
6
+ # PDFBeads -- convert scanned images to a single PDF file
7
+ # Version 1.0
8
+ #
9
+ # Unlike other PDF creation tools, this utility attempts to implement
10
+ # the approach typically used for DjVu books. Its key feature is
11
+ # separating scanned text (typically black, but indexed images with
12
+ # a small number of colors are also accepted) from halftone images
13
+ # placed into a background layer.
14
+ #
15
+ # Copyright (C) 2010 Alexey Kryukov (amkryukov@gmail.com).
16
+ # All rights reserved.
17
+ #
18
+ # This program is free software; you can redistribute it and/or modify
19
+ # it under the terms of the GNU General Public License as published by
20
+ # the Free Software Foundation; either version 2 of the License, or
21
+ # (at your option) any later version.
22
+ #
23
+ # This program is distributed in the hope that it will be useful,
24
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
25
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26
+ # GNU General Public License for more details.
27
+ #
28
+ # You should have received a copy of the GNU General Public License
29
+ # along with this program; if not, write to the Free Software
30
+ # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
31
+ #
32
+ #######################################################################
33
+
34
+ require 'time'
35
+ require 'stringio'
36
+
37
+ # The key class where the actual generation of a PDF file is performed.
38
+ class PDFBeads::PDFBuilder
39
+ require 'pdfbeads/pdfdoc'
40
+ require 'pdfbeads/pdffont'
41
+ require 'pdfbeads/pdflabels'
42
+ require 'pdfbeads/pdftoc'
43
+
44
# Lookup table from an image type name to a PDF color space name; unknown
# keys fall back to '/DeviceRGB'.
# NOTE(review): the keys look like RMagick image type names -- confirm.
@@cmodes = Hash.new( '/DeviceRGB' ).update(
  'BilevelType' => '/DeviceGray',
  'GrayscaleType' => '/DeviceGray',
  'PaletteType' => '/Indexed',
  'PaletteMatteType' => '/Indexed',
  'TrueColorType' => '/DeviceRGB',
  'TrueColorMatteType' => '/DeviceRGB',
  'ColorSeparationType' => '/DeviceCMYK',
  'ColorSeparationMatteType' => '/DeviceCMYK',
  'PaletteBilevelMatteType' => '/DeviceGray'
)
56
+
57
# Create a builder with an empty document.
#
# pdfargs:: option hash; this class reads its :labels, :toc, :meta and
#           :pagelayout entries.
def initialize( pdfargs )
  @pdfargs = pdfargs
  @now = Time.now
  @doc = Doc.new
  @fdata = FontDataProvider.new

  # Path and PDF object of the JBIG2 symbol dictionary added most recently;
  # loadJBIG2Page compares against the path to avoid adding it twice.
  @dictpath, @dictobj = '', nil
end
66
+
67
# Build the whole PDF object graph inside @doc: catalog, info dictionary,
# outlines, optional content groups, page labels, one page object per entry
# of pagefiles and, when any page has hOCR data, the fonts for the text layer.
#
# pagefiles:: enumerable of page descriptors. Each one is asked for width,
#             height, x_res, y_res, stencils, fg_layer, bg_layer, hocr_path
#             and name.
# st_format:: 'JBIG2' loads stencils with loadJBIG2Page, any other value
#             loads them with loadCCITTPage.
def process( pagefiles,st_format )
  labels = toc = nil
  labels = PDFLabels.new( @pdfargs[:labels] ) unless @pdfargs[:labels].nil?
  toc = PDFTOC.new( @pdfargs[:toc] ) unless @pdfargs[:toc].nil?
  meta = parseMeta( @pdfargs[:meta] )
  has_labels = !labels.nil? && labels.length > 0

  cat = XObj.new({
    'Type' => '/Catalog',
    'PageLayout' => "/#{@pdfargs[:pagelayout]}"
  })
  @doc.addObject(cat)

  info = XObj.new({
    'Creator' => "(PDFBeads)",
    'Producer' => "(PDFBeads)",
    'CreationDate' => "(#{pdfCreationDate( @now )})"
  })
  @doc.addObject(info)
  # Metadata values are UTF-16BE (see parseMeta), hence the byte order mark.
  meta.each_key do |key|
    info.addToDict(key, "(\xFE\xFF#{escapePDFString( meta[key] )})")
  end

  out = XObj.new({
    'Type' => '/Outlines',
    'Count' => 0
  })
  @doc.addObject(out)
  cat.addToDict('Outlines', ref(out.getID))

  pages = XObj.new({
    'Type' => '/Pages'
  })
  @doc.addObject(pages)
  cat.addToDict('Pages', ref(pages.getID))

  creator = XObj.new({
    'Subtype' => '/Artwork',
    'Creator' => "(PDFBeads)",
    'Feature' => '(Layers)'
  })
  @doc.addObject(creator)

  # Two optional content groups, so that the foreground and the background
  # can be switched independently in a viewer.
  ocFore = XObj.new({
    'Type' => '/OCG',
    'Name' => '(Foreground)',
    'Usage' => "<</CreatorInfo #{ref(creator.getID)}>>",
    'Intent' => '[/View/Design]'
  })
  @doc.addObject(ocFore)
  ocBack = XObj.new({
    'Type' => '/OCG',
    'Name' => '(Background)',
    'Usage' => "<</CreatorInfo #{ref(creator.getID)}>>",
    'Intent' => '[/View/Design]'
  })
  @doc.addObject(ocBack)
  cat.addToDict('OCProperties',
    sprintf("<< /OCGs[%s %s] /D<< /Intent /View /BaseState (ON) /Order[%s %s] >>>>",
      ref(ocFore.getID), ref(ocBack.getID), ref(ocFore.getID), ref(ocBack.getID)))

  page_objs = []
  pages_by_num = {}
  pidx = 0
  cur_range_id = 0

  cat.addToDict('PageLabels', buildPageLabels( labels )) if has_labels

  # A font is only needed when at least one page carries hOCR text.
  needs_font = pagefiles.any? { |pg| !pg.hocr_path.nil? }
  encodings = fdict = descr = nil
  if needs_font
    encodings = [ [' '] ]
    fdict = XObj.new( {} )
    @doc.addObject( fdict )

    descr = XObj.new({
      'Type' => '/FontDescriptor',
      'BaseFont' => '/Times-Roman'
    })
    @fdata.header.each_key do |key|
      descr.addToDict( key,@fdata.header[key] )
    end
    @doc.addObject( descr )
  end

  pagefiles.each do |pg|
    procSet = ['/PDF', '/ImageB']
    c_str = ''
    doc_objs = []
    lastimg = 0

    xres = pg.x_res
    yres = pg.y_res
    pwidth = pg.width.to_f / xres * 72
    pheight = pg.height.to_f / yres * 72

    pg.stencils.each do |s|
      loaded = if st_format.eql? 'JBIG2'
        loadJBIG2Page( s[:jbig2path],s[:jbig2dict],ref(ocFore.getID) )
      else
        loadCCITTPage( s[:path],ref(ocFore.getID) )
      end
      # Both loaders return nil on failure. Stop at the first failure, but
      # keep the resolution known so far: it is needed for the text layer.
      break if loaded.nil? || loaded[0].nil?
      xobj, _width, _height, xres, yres = loaded

      doc_objs << xobj
      c_str << "#{s[:rgb].join(' ')} rg /Im#{lastimg} Do "
      lastimg += 1
    end

    fg_image = bg_image = nil
    fg_image = loadImage( pg.fg_layer,ocFore.getID,procSet ) unless pg.fg_layer.nil?
    bg_image = loadImage( pg.bg_layer,ocBack.getID,procSet ) unless pg.bg_layer.nil?

    contents = XObj.new({
      'Filter' => '/FlateDecode'
    })
    resobj = XObj.new( {} )
    resources = XObj.new({
      'XObject' => ref(resobj.getID)
    })

    if fg_image.nil?
      doc_objs.each_index do |i|
        resobj.addToDict( "Im#{i}", ref(doc_objs[i].getID) )
      end
    else
      # The first stencil becomes the soft mask of the foreground image and
      # the image is painted in place of the stencils.
      mask = doc_objs[0]
      unless mask.nil?
        fg_image.addToDict('SMask', ref(mask.getID))
        mask.removeFromDict('ImageMask')
        mask.addToDict('Decode', '[1 0]')
      end
      resobj.addToDict('Im0', ref(fg_image.getID))
      doc_objs << fg_image
      c_str = '/Im0 Do '
    end

    unless bg_image.nil?
      # The background is painted first, so its operator is prepended.
      bg_name = "Im#{resobj.dictLength}"
      c_str = "/#{bg_name} Do " << c_str
      resobj.addToDict( bg_name, ref(bg_image.getID) )
      doc_objs << bg_image
    end
    c_str = sprintf( "q %.2f 0 0 %.2f 0 0 cm %sQ",pwidth,pheight,c_str )

    doc_objs.concat( [contents, resobj, resources] )

    hocr = nil
    unless pg.hocr_path.nil?
      # File.open rather than Kernel#open: the latter would treat a path
      # starting with '|' as a command to run.
      hocr = File.open( pg.hocr_path ) { |f| Hpricot.parse( f ) }
      procSet << '/Text'
      c_str << getPDFText( hocr,pheight,72.0/xres,72.0/yres,encodings )
    end

    contents.reinit({
      'Filter' => '/FlateDecode'
    }, Zlib::Deflate.deflate( c_str,9 ) )
    resources.addToDict( 'ProcSet', "[ #{procSet.join(' ')} ]" )
    resources.addToDict( 'Font', ref( fdict.getID ) ) unless hocr.nil?

    page = XObj.new({
      'Type' => '/Page',
      'Parent' => "#{pages.getID} 0 R",
      'MediaBox' => sprintf( "[ 0 0 %.02f %.02f ]",pwidth,pheight ),
      'Contents' => ref( contents.getID ),
      'Resources' => ref( resources.getID )
    })
    # By default acroread uses /DeviceCMYK as a transparency blending space,
    # so adding an SMask image to a page would result in colors being
    # shifted, unless we take special care of this. For more details see
    # http://comments.gmane.org/gmane.comp.tex.pdftex/3747
    unless fg_image.nil?
      cspace = '/DeviceRGB'
      cspace = fg_image.getFromDict( 'ColorSpace' ) if fg_image.hasInDict( 'ColorSpace' )
      page.addToDict( 'Group', "<< /S /Transparency /CS #{cspace} >>" )
    end
    doc_objs << page
    doc_objs.each{ |x| @doc.addObject(x) }
    page_objs << page

    # Remember the page under its label (or its 1-based number) so that TOC
    # entries can be resolved later on.
    pkey = pidx + 1
    pkey = labels.getPageLabel( cur_range_id,pidx ) if has_labels
    pages_by_num[pkey] = page.getID
    pidx += 1
    if has_labels
      if cur_range_id < labels.length - 1 and labels[cur_range_id + 1][:first] == pidx
        cur_range_id += 1
      end
    end

    $stderr.puts("Processed #{pg.name}\n")
    $stderr.puts(" Added background image from #{pg.bg_layer}\n") unless bg_image.nil?
    $stderr.puts(" Added foreground image from #{pg.fg_layer}\n") unless fg_image.nil?
  end

  # Written once, after all pages are known.
  pages.addToDict( 'Count', page_objs.length )
  pages.addToDict( 'Kids', "[#{page_objs.map{ |x| ref(x.getID) }.join(' ')}]" )

  if needs_font
    encodings.each_with_index do |enc,idx|
      fname = "Fnt#{idx + 1}"
      font = addFont( descr,enc,fname )
      fdict.addToDict( fname,ref(font.getID) )
    end
  end

  if !toc.nil? && toc.length > 0 && !page_objs.empty?
    getOutlineObjs( toc,pages_by_num,page_objs[0].getID )
    cat.addToDict('Outlines', ref(toc[0][:pdfobj].getID))
    cat.addToDict('PageMode', "/UseOutlines")
  end
end

# Format a Time as the contents of a PDF date string: 'Z' for a zero UTC
# offset, otherwise a sign followed by the absolute offset as HH'mm.
def pdfCreationDate( time )
  stamp = sprintf( "D:%04d%02d%02d%02d%02d%02d",
    time.year, time.month, time.day, time.hour, time.min, time.sec )
  offset = time.gmt_offset
  return stamp + 'Z' if offset == 0

  hours, mins = ( offset.abs / 60 ).divmod( 60 )
  stamp + sprintf( "%s%02d'%02d", offset > 0 ? '+' : '-', hours, mins )
end

# Escape the bytes which are special inside a PDF literal string: both
# parentheses, the backslash and the carriage return. This has to be done on
# bytes, as any of them may be one half of a UTF-16 code unit. Returns a
# new string tagged as UTF-8, ready for interpolation.
def escapePDFString( str )
  str.dup.to_binary.gsub( /[()\\\r]/n ) { |b| b == "\r" ? "\\r" : "\\" + b }.to_text
end

# Build the number tree for the /PageLabels entry of the catalog. A range
# without a :prefix, :style or :start key just omits the matching entry.
def buildPageLabels( labels )
  nTree = "<</Nums[\n"
  labels.each do |rng|
    nTree << "#{rng[:first]} << "
    if rng.has_key? :prefix
      ltitl = Iconv.iconv( "utf-16be", "utf-8", rng[:prefix] ).first
      nTree << "/P (\xFE\xFF#{escapePDFString( ltitl )}) "
    end
    nTree << "/S /#{rng[:style]} " if rng.has_key? :style
    nTree << "/St #{rng[:start]}" if rng.has_key? :start
    nTree << ">>\n"
  end
  nTree << "]\n>>"
end

private :pdfCreationDate, :escapePDFString, :buildPageLabels
315
+
316
# Output the created PDF file to the disk.
#
# outpath:: path of the file to write, or the literal string 'STDOUT' to
#           write to the standard output instead.
#
# The stream is always switched to binary mode, as a PDF file is binary
# data. Any error is reported on stderr and not propagated to the caller.
def output( outpath )
  data = @doc.to_s
  if outpath.eql? 'STDOUT'
    $stdout.binmode
    $stdout.write( data )
  else
    # The block form closes the file even if the write fails.
    File.open( outpath,'wb' ) { |out| out.write( data ) }
  end
rescue
  $stderr.puts( "Error: could not write to #{outpath}" )
end
332
+
333
+ private
334
+
335
# Read document metadata from a text file.
#
# path:: file with lines of the form  Key: "value"  (an optional slash may
#        precede the key); nil or an empty string means there is no file.
#
# Lines starting with '#' are skipped. Only the Title, Author, Subject and
# Keywords keys are accepted. Returns a hash from key to the value converted
# from UTF-8 to UTF-16BE; values which cannot be converted are reported on
# stderr and left out.
def parseMeta( path )
  ret = {}
  return ret if path.nil? || path.eql?( '' )

  accepted = %w[Title Author Subject Keywords]
  File.open( path,'r' ) do |fin|
    fin.set_encoding 'UTF-8' if fin.respond_to? :set_encoding
    fin.each do |fl|
      next if /^\#/.match( fl )

      m = /^\/?([A-Za-z]+)[ \t]*:[ \t]+\"(.*)\"/.match( fl )
      next if m.nil?
      next unless accepted.include?( m[1] )

      begin
        ret[m[1]] = Iconv.iconv( "utf-16be", "utf-8", m[2] ).first
      rescue
        $stderr.puts("Error: metadata should be specified in utf-8")
      end
    end
  end
  ret
end
359
+
360
# Create the outline (bookmark) objects for a table of contents and add
# them to @doc.
#
# toc::      array of TOC items; toc[0] is the root. Every item receives its
#            PDF object under the :pdfobj key. The other items are read for
#            :title (UTF-16BE bytes), :ref, :parent, :children, :open, :prev
#            and :next.
# page_ids:: hash from a page label (or number) to the ID of the page object.
# fp_id::    ID of the first page object. It is not used at present: an
#            item pointing to an unknown page gets no destination at all
#            and is shown in gray.
def getOutlineObjs( toc,page_ids,fp_id )
  root = toc[0]
  root[:pdfobj] = XObj.new({
    'Type' => '/Outlines',
    'Count' => root.getChildrenCount
  })
  @doc.addObject(root[:pdfobj])

  toc[1..-1].each do |item|
    dest = nil
    if page_ids.has_key? item[:ref]
      dest = page_ids[item[:ref]]
    else
      $stderr.puts("Malformed TOC: there is no page #{item[:ref]} in this document.")
    end

    # Inside a PDF literal string parentheses, backslashes and carriage
    # returns must be escaped. A PDF reader scans bytes, not characters, so
    # every such byte counts, including one half of a UTF-16 code unit
    # (U+0428 contains 0x28, for instance). A copy is escaped, so that
    # the title kept in the TOC stays intact.
    item_text = item[:title].dup.to_binary.gsub( /[()\\\r]/n ) do |b|
      b == "\r" ? "\\r" : "\\" + b
    end
    item[:pdfobj] = XObj.new({
      'Title' => "(\xFE\xFF#{item_text.to_text})",
      'Parent' => ref(item[:parent][:pdfobj].getID)
    })
    if dest.nil?
      item[:pdfobj].addToDict('C', "[0.75 0.75 0.75]")
    else
      item[:pdfobj].addToDict('Dest', "[ #{dest} 0 R /XYZ null null null ]")
    end

    # A negative count marks an item whose children are initially hidden.
    if item[:children].length > 0
      cnt = item.getChildrenCount
      item[:pdfobj].addToDict('Count', item[:open] ? cnt : -cnt)
    end

    if item.has_key? :prev
      item[:prev][:pdfobj].addToDict('Next', ref(item[:pdfobj].getID))
      item[:pdfobj].addToDict('Prev', ref(item[:prev][:pdfobj].getID))
    else
      item[:parent][:pdfobj].addToDict('First', ref(item[:pdfobj].getID))
    end

    unless item.has_key? :next
      item[:parent][:pdfobj].addToDict('Last', ref(item[:pdfobj].getID))
    end

    @doc.addObject(item[:pdfobj])
  end
end
414
+
415
# Returns an array containing the coordinates of the bounding box around
# an element.
#
# element:: hOCR element; its 'title' attribute is searched for a
#           "bbox x0 y0 x1 y1" group.
# xscale::  factor applied to both horizontal values (x0 and x1).
# yscale::  factor applied to both vertical values (y0 and y1).
#
# The result is [x0, y0, x1, y1] as floats, or [0,0,0,0] if the element has
# no bbox.
def elementCoordinates( element,xscale,yscale )
  out = [0,0,0,0]

  title = element.attributes.to_hash['title']
  return out if title.nil?

  m = /bbox((\s+\d+){4})/.match( title )
  unless m.nil?
    x0, y0, x1, y1 = m[1].strip.split( /\s+/ ).map { |c| c.to_i }
    out = [ (x0*xscale).to_f,(y0*yscale).to_f,
            (x1*xscale).to_f,(y1*yscale).to_f ]
  end
  return out
end
429
+
430
# Produce the content stream fragment (BT ... ET) holding the text layer of
# one page. The text is written with rendering mode 3.
#
# hocr::      parsed hOCR document; it is searched for the Content-Type meta
#             element and for span elements of class 'ocr_line'.
# pheight::   page height, from which the bottom of each line box is
#             subtracted to get the vertical text position.
# xscale::    factor passed to elementCoordinates for horizontal values.
# yscale::    factor passed to elementCoordinates for vertical values.
# encodings:: array of character arrays, one per font; a character is
#             written as its index in one of them. The array is extended in
#             place as new characters are met, and a new encoding is started
#             once the last one holds 256 characters.
def getPDFText( hocr,pheight,xscale,yscale,encodings )
  fsize = 10
  cur_enc = encodings[0]
  ret = " BT 3 Tr /Fnt1 #{fsize} Tf "

  # The charset is taken from the Content-Type meta element. Whitespace
  # after the semicolon is accepted, as "text/html; charset=..." is common.
  charset = 'utf-8'
  hocr.search("//meta[@http-equiv='Content-Type']").each do |el|
    content = el.attributes.to_hash['content']
    next if content.nil?
    m = /\Atext\/html;\s*charset=([A-Za-z0-9-]+)\Z/i.match( content )
    charset = m[1] unless m.nil?
  end

  hocr.search("//span[@class='ocr_line']").each do |line|
    # Every run of line breaks inside the line becomes a single space.
    txt = line.to_plain_text.strip.gsub( /[\n\r]+/,' ' )
    begin
      txt = Iconv.iconv( 'utf-8',charset,txt ).first unless charset.downcase.eql? 'utf-8'
    rescue
      txt = ''
    end
    next if txt.eql? ''
    txt.force_encoding( 'utf-8' ) if txt.respond_to? :force_encoding
    # A hyphen at the end of the line is replaced with a soft hyphen.
    txt.sub!( /-\Z/, "\xC2\xAD" )

    # Without a positive width the line cannot be scaled to its box: the
    # division below would put Inf or NaN into the text matrix.
    line_width = @fdata.getLineWidth( txt,fsize )
    next unless line_width > 0

    bbox = elementCoordinates( line,xscale,yscale )
    ratio = ( bbox[2] - bbox[0] ) / line_width
    ret << sprintf( "%f %f %f %f %f %f Tm ",
      ratio, 0, 0, ratio, bbox[0], pheight - bbox[3] - @fdata.header['Descent'] * fsize / 1000.0)

    txt8 = ''
    txt.each_char do |char|
      begin
        Iconv.iconv( "utf-16be","utf-8",char )
      rescue
        rawbytes = char.unpack( 'C*' )
        bs = rawbytes.map { |b| sprintf( "%02x",b ) }.join
        $stderr.puts( "Warning: an invalid UTF-8 sequence (#{bs}) in the hOCR data." )
        char = '?' * rawbytes.length
      end

      unless cur_enc.include? char
        # Look for another encoding which already has this character.
        target = nil
        encodings.each_index do |i|
          next if encodings[i] == cur_enc
          if encodings[i].include? char
            target = i
            break
          end
        end

        if target.nil?
          # Not seen before: append it to the last encoding, or start a new
          # one if the last is full.
          last = encodings[-1]
          if last.length < 256
            last << char
          else
            last = [ ' ',char ]
            encodings << last
          end
          target = encodings.length - 1 if cur_enc != last
        end

        # Switching fonts: flush what has been collected for the old one.
        unless target.nil?
          ret << "<#{txt8}> Tj "
          cur_enc = encodings[target]
          ret << "/Fnt#{target + 1} #{fsize} Tf "
          txt8 = ''
        end
      end

      txt8 << sprintf( "%02X",cur_enc.index(char) )
    end

    ret << "<#{txt8}> Tj " unless txt8.eql? ''
  end

  ret << "ET "
  return ret
end
513
+
514
# Add a Type1 font (Times-Roman with a custom encoding) to @doc, together
# with its encoding object and its ToUnicode map.
#
# descr:: font descriptor object shared by all fonts.
# fenc::  array of characters; the index of a character is its code.
# fname:: resource name of the font, without the leading slash.
#
# Returns the font object.
def addFont( descr,fenc,fname )
  enc_str = @fdata.getEncoding( fenc ).join( ' ' )
  enc = XObj.new({
    'Type' => "/Encoding",
    'Differences' => "[ 0 #{enc_str} ]"
  })
  @doc.addObject( enc )

  toUni = @fdata.getCMAP( fenc )
  @doc.addObject( toUni )

  font = XObj.new({
    'BaseFont' => '/Times-Roman',
    'Name' => "/#{fname}",
    'Subtype' => '/Type1',
    'Type' => '/Font',
    'FirstChar' => 0,
    'LastChar' => fenc.length - 1,
    'Widths' => "[ #{@fdata.getWidths(fenc).map{ |w| w.to_s }.join(' ')} ]",
    'FontDescriptor' => ref(descr.getID),
    'ToUnicode' => ref(toUni.getID)
  })
  # The encoding object has just been created above, so it is always there.
  font.addToDict( 'Encoding',ref(enc.getID) )
  @doc.addObject( font )
  return font
end
544
+
545
# Load a stencil image as a CCITT Group 4 compressed image mask.
#
# path::  path of the image file.
# ocref:: reference to the optional content group the image belongs to.
#
# A file which is already CCITT compressed and stored as a single strip is
# used as is. Anything else is first converted with ImageMagick into a
# Group 4 TIFF with one strip.
#
# Returns [ xobj,width,height,xres,yres ], or nil if the image size cannot
# be determined.
def loadCCITTPage( path,ocref )
  stencil = ImageInspector.new( path )
  return nil if stencil.width.nil?

  width = stencil.width
  height = stencil.height
  xres = stencil.x_dpi
  yres = stencil.y_dpi

  # Tag 0x116 is only looked at once the compression is known to be CCITT,
  # the same way loadImage only reads it for TIFF files.
  # NOTE(review): presumably 0x116 is the TIFF RowsPerStrip tag and tags is
  # not filled in for other formats -- confirm against ImageInspector.
  single_strip = stencil.compression.eql?( :CCITTFaxDecode ) &&
                 stencil.tags[0x116][0] >= height
  unless single_strip
    img = ImageList.new( path )
    begin
      imgdata = img.to_blob{
        self.format = 'TIFF'
        self.define( 'TIFF','rows-per-strip',height )
        self.compression = Group4Compression
      }
    ensure
      # Release the image even if the conversion fails.
      img.destroy!
    end
    stencil = ImageInspector.new( StringIO.new(imgdata) )
  end
  body = stencil.getRawData

  xobj = XObj.new({
    'Type' => '/XObject',
    'Subtype' => '/Image',
    'OC' => ocref,
    'Width' => width.to_s,
    'Height' => height.to_s,
    'ImageMask' => 'true',
    'ColorSpace' => '/DeviceGray',
    'BitsPerComponent' => '1',
    'Filter' => '/CCITTFaxDecode',
    'DecodeParms' => "<< /Columns #{width} /K -1 >>"
  }, body)

  return [ xobj,width,height,xres,yres ]
end
582
+
583
# Load a JBIG2 compressed stencil as an image mask.
#
# path::     path of the JBIG2 page file.
# dictpath:: path of the symbol dictionary the page refers to. It is added
#            to @doc only if it differs from the one added most recently.
# ocref::    reference to the optional content group the image belongs to.
#
# Width, height and both resolutions are read as four big-endian 32-bit
# integers from bytes 11...27 of the page file.
#
# Returns [ xobj,width,height,xres,yres ], or nil (after a message on
# stderr) if one of the files cannot be read.
def loadJBIG2Page( path,dictpath,ocref )
  begin
    # Binary mode is essential here: otherwise Ruby 1.9+ would tag the data
    # with the default external encoding and the slice below would count
    # characters instead of bytes. The block form closes the files.
    jbig2 = File.open( path,'rb' ) { |f| f.read }
    width, height, xres, yres = jbig2[11...27].unpack( 'NNNN' )
    unless @dictpath.eql? dictpath
      symd_f = File.open( dictpath,'rb' ) { |f| f.read }
      symd_o = @doc.addObject( XObj.new({},symd_f) )
      @dictpath = dictpath
      @dictobj = symd_o
    end
  rescue
    $stderr.puts( "Page not completed: could not access #{path}" )
    return nil
  end

  xobj = XObj.new({
    'Type' => '/XObject',
    'Subtype' => '/Image',
    'OC' => ocref,
    'Width' => width.to_s,
    'Height' => height.to_s,
    'ImageMask' => 'true',
    'ColorSpace' => '/DeviceGray',
    'BitsPerComponent' => '1',
    'Filter' => '/JBIG2Decode',
    'DecodeParms' => "<< /JBIG2Globals #{@dictobj.getID} 0 R >>"
  }, jbig2)

  return [ xobj,width,height,xres,yres ]
end
613
+
614
# Load a foreground or background picture as an image object.
#
# impath::  path of the image file.
# ocID::    ID of the optional content group the image belongs to.
# procSet:: array of procedure set names of the page; '/ImageI' or '/ImageC'
#           is appended to it when the image requires that.
#
# Returns the image object (it is not added to the document here), or nil
# if the image size cannot be determined.
def loadImage( impath,ocID,procSet )
  insp = ImageInspector.new( impath )
  return nil if insp.width.nil?

  # JPEG, JPEG2000 and PNG images can be handled directly. The same goes for
  # uncompressed TIFF files, although it is very unlikely someone would use
  # them for a page background. Compressed TIFF images are harder, as they
  # normally consist of several compressed chunks which can't be simply
  # concatenated; so only those stored as a single strip are taken as they
  # are. Everything else is converted with ImageMagick into a zip-compressed
  # PNG, and the raw data is then retrieved from that PNG image.
  usable = [ :JPEG, :JPEG2000, :PNG ].include?( insp.format ) ||
    ( insp.format.eql?( :TIFF ) &&
      ( insp.compression.eql?( :NoCompression ) ||
        ( [ :FlateDecode,:LZWDecode,:CCITTFaxDecode ].include?( insp.compression ) &&
          insp.tags[0x0116][0] >= insp.height ) ) )
  if !usable
    img = ImageList.new( impath )
    imgdata = img.to_blob{
      self.format = 'PNG'
      self.quality = 95
      self.compression = ZipCompression
    }
    insp = ImageInspector.new( StringIO.new(imgdata) )
    img.destroy!
  end

  rawdata = insp.getRawData
  cspace = "/#{insp.cspace}"
  fmt = insp.format
  imgcompr = insp.compression

  if cspace.eql?( '/Indexed' ) && !insp.palette.nil?
    # A palette with nothing but gray entries is written with one byte per
    # entry over /DeviceGray, any other palette with three over /DeviceRGB.
    cpal = insp.palette
    rgb = cpal.any? { |c| c[0] != c[1] || c[0] != c[2] }
    base = rgb ? '/DeviceRGB' : '/DeviceGray'

    cspace = "[/Indexed #{base} #{cpal.length - 1} < "
    cpal.each do |c|
      cspace << sprintf( "%02x ",c[0] )
      cspace << sprintf( "%02x %02x ",c[1],c[2] ) if rgb
    end
    cspace << '>]'

    procSet << '/ImageI' unless procSet.include? '/ImageI'
  elsif !cspace.eql?( '/DeviceGray' ) && !procSet.include?( '/ImageC' )
    procSet << '/ImageC'
  end

  # Number of color components per pixel, for the predictor parameters.
  per_comp = case cspace
             when '/DeviceRGB' then 3
             when '/DeviceCMYK' then 4
             else 1
             end

  image = XObj.new({
    'Type' => '/XObject',
    'Subtype' => '/Image',
    'OC' => ref( ocID ),
    'Width' => insp.width,
    'Height' => insp.height,
    'Interpolate' => 'true'
  }, rawdata )

  unless fmt.eql? :JPEG2000
    image.addToDict( 'BitsPerComponent',insp.depth )
    image.addToDict( 'ColorSpace',"#{cspace}" )
  end
  image.addToDict( 'Filter',"/#{imgcompr}" ) unless insp.compression.eql? :NoCompression
  if [:PNG, :TIFF].include? fmt
    predictor = fmt.eql?( :PNG ) ? 15 : 2
    image.addToDict( 'DecodeParms',
      "<< /Predictor #{predictor} /Colors #{per_comp} /BitsPerComponent #{insp.depth} /Columns #{insp.width} >>" )
  end
  return image
end
694
+
695
# Indirect reference to the object with the given ID, generation 0.
def ref(x)
  x.to_s + ' 0 R'
end
698
+ end
699
+