pdfbeads 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/pdfbeads.rb ADDED
@@ -0,0 +1,93 @@
1
+ # encoding: UTF-8
2
+
3
+ ######################################################################
4
+ #
5
+ # PDFBeads -- convert scanned images to a single PDF file
6
+ # Version 1.0
7
+ #
8
+ # Unlike other PDF creation tools, this utility attempts to implement
9
+ # the approach typically used for DjVu books. Its key feature is
10
+ # separating scanned text (typically black, but indexed images with
11
+ # a small number of colors are also accepted) from halftone images
12
+ # placed into a background layer.
13
+ #
14
+ # Copyright (C) 2010 Alexey Kryukov (amkryukov@gmail.com).
15
+ # All rights reserved.
16
+ #
17
+ # This program is free software; you can redistribute it and/or modify
18
+ # it under the terms of the GNU General Public License as published by
19
+ # the Free Software Foundation; either version 2 of the License, or
20
+ # (at your option) any later version.
21
+ #
22
+ # This program is distributed in the hope that it will be useful,
23
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
24
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25
+ # GNU General Public License for more details.
26
+ #
27
+ # You should have received a copy of the GNU General Public License
28
+ # along with this program; if not, write to the Free Software
29
+ # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
30
+ #
31
+ #######################################################################
32
+
33
+ require 'iconv'
34
+ require 'zlib'
35
+
36
+ require 'RMagick'
37
+ include Magick
38
+
39
+ begin
40
+ require 'hpricot'
41
+ $has_hpricot = true
42
+ rescue LoadError
43
+ $stderr.puts( "Warning: the hpricot extension is not available." )
44
+ $stderr.puts( " pdfbeads will not be able to read OCR data from hOCR files." )
45
+ $has_hpricot = false
46
+ end
47
+
48
+ unless ''.respond_to? :ord
49
+ $KCODE = 'u'
50
+ require 'jcode'
51
+ end
52
+
53
class String
  # Tags a string that is supposed to be treated as a raw sequence of bytes.
  # This matters for Ruby 1.9 and later; on older interpreters, which have
  # no String#force_encoding, the method simply hands the receiver back.
  if !method_defined?( :to_binary )
    def to_binary
      if respond_to?( :force_encoding )
        force_encoding( 'ASCII-8BIT' )
      end
      self
    end
  end

  # Tags a string as UTF-8. In Ruby 1.9 this is sometimes needed even for
  # data which is known not to be UTF-8, just to be able to combine it with
  # other strings. The receiver is modified in place and returned.
  if !method_defined?( :to_text )
    def to_text
      if respond_to?( :force_encoding )
        force_encoding( 'UTF-8' )
      end
      self
    end
  end

  # Unicode ordinal of an encoded character (Ruby < 1.9 has no standard
  # method for this). The string is converted to UTF-16BE and the first
  # 16-bit unit is returned; a question mark is the fallback on any error.
  if !method_defined?( :ord )
    def ord
      utf16 = Iconv.iconv( 'utf-16be','utf-8',self ).first
      utf16.unpack('n')[0]
    rescue
      0x3F # Question mark
    end
  end
end
85
+
86
+ require 'imageinspector'
87
+
88
+ module PDFBeads # Top-level namespace of the library.
89
+ VERSION = '1.0' # Library version string.
90
+ require 'pdfbeads/pdfbuilder' # Loaded inside the module body, i.e. after the PDFBeads constant exists.
91
+ require 'pdfbeads/pdfpage' # Same here; presumably provides the page description class -- TODO confirm.
92
+ end
93
+
@@ -0,0 +1,699 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: UTF-8
3
+
4
+ ######################################################################
5
+ #
6
+ # PDFBeads -- convert scanned images to a single PDF file
7
+ # Version 1.0
8
+ #
9
+ # Unlike other PDF creation tools, this utility attempts to implement
10
+ # the approach typically used for DjVu books. Its key feature is
11
+ # separating scanned text (typically black, but indexed images with
12
+ # a small number of colors are also accepted) from halftone images
13
+ # placed into a background layer.
14
+ #
15
+ # Copyright (C) 2010 Alexey Kryukov (amkryukov@gmail.com).
16
+ # All rights reserved.
17
+ #
18
+ # This program is free software; you can redistribute it and/or modify
19
+ # it under the terms of the GNU General Public License as published by
20
+ # the Free Software Foundation; either version 2 of the License, or
21
+ # (at your option) any later version.
22
+ #
23
+ # This program is distributed in the hope that it will be useful,
24
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
25
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26
+ # GNU General Public License for more details.
27
+ #
28
+ # You should have received a copy of the GNU General Public License
29
+ # along with this program; if not, write to the Free Software
30
+ # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
31
+ #
32
+ #######################################################################
33
+
34
+ require 'time'
35
+ require 'stringio'
36
+
37
+ # The key class where the actual generation of a PDF file is performed.
38
+ class PDFBeads::PDFBuilder
39
+ require 'pdfbeads/pdfdoc'
40
+ require 'pdfbeads/pdffont'
41
+ require 'pdfbeads/pdflabels'
42
+ require 'pdfbeads/pdftoc'
43
+
44
# Lookup table from image type names (they look like RMagick ImageType
# names -- TODO confirm against the code that reads this table) to PDF
# color space names. Any type missing from the table yields '/DeviceRGB'.
@@cmodes = {
  'BilevelType'              => '/DeviceGray',
  'GrayscaleType'            => '/DeviceGray',
  'PaletteType'              => '/Indexed',
  'PaletteMatteType'         => '/Indexed',
  'TrueColorType'            => '/DeviceRGB',
  'TrueColorMatteType'       => '/DeviceRGB',
  'ColorSeparationType'      => '/DeviceCMYK',
  'ColorSeparationMatteType' => '/DeviceCMYK',
  'PaletteBilevelMatteType'  => '/DeviceGray'
}
@@cmodes.default = '/DeviceRGB'
56
+
57
# Creates a builder with an empty document.
#
# pdfargs -- options hash; the visible code reads its :labels, :toc, :meta
#            and :pagelayout entries.
def initialize( pdfargs )
  # Path and PDF object of the JBIG2 symbol dictionary loaded most
  # recently, so that one dictionary is embedded only once.
  @dictpath = ''
  @dictobj = nil

  @pdfargs = pdfargs
  @now = Time.now
  @doc = Doc.new
  @fdata = FontDataProvider.new
end
66
+
67
# Builds the complete PDF object tree for the given pages inside @doc.
#
# pagefiles -- collection of page descriptions. Each of them is asked for
#              name, width, height, x_res, y_res, stencils, fg_layer,
#              bg_layer and hocr_path.
# st_format -- 'JBIG2' makes the stencils be read as JBIG2 streams; with
#              any other value they are loaded as CCITT Group 4 data.
#
# Nothing is written to disk here; call #output for that.
def process( pagefiles,st_format )
  labels = toc = nil
  labels = PDFLabels.new( @pdfargs[:labels] ) unless @pdfargs[:labels].nil?
  toc = PDFTOC.new( @pdfargs[:toc] ) unless @pdfargs[:toc].nil?
  meta = parseMeta( @pdfargs[:meta] )
  has_labels = ( labels != nil and labels.length > 0 )

  # Escapes the bytes which are special inside a PDF literal string. This is
  # done bytewise on purpose: a PDF reader scans the string byte by byte, so
  # a parenthesis or backslash byte must be protected even when it is just
  # one half of a UTF-16 unit.
  pdf_escape = lambda do |str|
    str.to_binary.gsub( /[\\()]/n ){ |ch| "\\" + ch }.to_text
  end

  cat = XObj.new(Hash[
    'Type'       => '/Catalog',
    'PageLayout' => "/#{@pdfargs[:pagelayout]}"
  ])
  @doc.addObject(cat)

  # PDF date string: the local time followed either by 'Z' (no offset from
  # UTC) or by the sign and the absolute offset as HH'mm. The sign is
  # emitted separately, so the digits are computed from the absolute value.
  gmt_offset = @now.gmt_offset
  creationDate = sprintf( "D:%04d%02d%02d%02d%02d%02d",
    @now.year, @now.month, @now.day, @now.hour, @now.min, @now.sec )
  if gmt_offset == 0
    creationDate << 'Z'
  else
    gmt_mins = gmt_offset.abs / 60
    creationDate << ( gmt_offset > 0 ? '+' : '-' )
    creationDate << sprintf( "%02d'%02d", gmt_mins/60, gmt_mins%60 )
  end
  info = XObj.new(Hash[
    'Creator'      => "(PDFBeads)",
    'Producer'     => "(PDFBeads)",
    'CreationDate' => "(#{creationDate})"
  ])
  @doc.addObject(info)
  meta.each_key do |key|
    info.addToDict(key, "(\xFE\xFF#{pdf_escape.call( meta[key] )})")
  end

  out = XObj.new(Hash[
    'Type'  => '/Outlines',
    'Count' => 0
  ])
  @doc.addObject(out)
  cat.addToDict('Outlines', ref(out.getID))

  pages = XObj.new(Hash[
    'Type' => '/Pages'
  ])
  @doc.addObject(pages)
  cat.addToDict('Pages', ref(pages.getID))

  creator = XObj.new(Hash[
    'Subtype' => '/Artwork',
    'Creator' => "(PDFBeads)",
    'Feature' => '(Layers)'
  ])
  @doc.addObject(creator)

  # Two optional content groups, so that a viewer can toggle the scanned
  # text and the background pictures separately.
  ocFore = XObj.new(Hash[
    'Type'   => '/OCG',
    'Name'   => '(Foreground)',
    'Usage'  => "<</CreatorInfo #{ref(creator.getID)}>>",
    'Intent' => '[/View/Design]'
  ])
  @doc.addObject(ocFore)
  ocBack = XObj.new({
    'Type'   => '/OCG',
    'Name'   => '(Background)',
    'Usage'  => "<</CreatorInfo #{ref(creator.getID)}>>",
    'Intent' => '[/View/Design]'
  })
  @doc.addObject(ocBack)
  cat.addToDict('OCProperties',
    sprintf("<< /OCGs[%s %s] /D<< /Intent /View /BaseState (ON) /Order[%s %s] >>>>",
      ref(ocFore.getID), ref(ocBack.getID), ref(ocFore.getID), ref(ocBack.getID)))

  page_objs = Array.new()
  pages_by_num = Hash.new()
  pidx = 0
  cur_range_id = 0

  if has_labels
    nTree = "<</Nums[\n"
    labels.each do |rng|
      nTree << "#{rng[:first]} << "
      # The prefix is optional, so it is converted only when present.
      if rng.has_key? :prefix
        ltitl = Iconv.iconv( "utf-16be", "utf-8", rng[:prefix] ).first
        nTree << "/P (\xFE\xFF#{pdf_escape.call( ltitl )}) "
      end
      nTree << "/S /#{rng[:style]} " if rng.has_key? :style
      nTree << "/St #{rng[:start]}" if rng.has_key? :start
      nTree << ">>\n"
    end

    nTree << "]\n>>"
    cat.addToDict('PageLabels', nTree)
  end

  # A font is needed as soon as at least one page has hOCR data.
  needs_font = false
  encodings = nil
  fdict = descr = nil
  pagefiles.each do |pf|
    unless pf.hocr_path.nil?
      needs_font = true
      break
    end
  end

  if needs_font
    encodings = [ [' '] ]
    fdict = XObj.new( Hash[] )
    @doc.addObject( fdict )

    descr = XObj.new( Hash[
      'Type'     => '/FontDescriptor',
      'BaseFont' => '/Times-Roman'
    ] )
    @fdata.header.each_key do |key|
      descr.addToDict( key,@fdata.header[key] )
    end
    @doc.addObject( descr )
  end

  pagefiles.each do |pf|
    procSet = ['/PDF', '/ImageB']
    c_str = ''
    doc_objs = Array.new()
    lastimg = 0

    width = pf.width; height = pf.height
    xres = pf.x_res; yres = pf.y_res
    pwidth  = width.to_f / xres * 72
    pheight = height.to_f / yres * 72

    pf.stencils.each do |s|
      if st_format.eql? 'JBIG2'
        loaded = loadJBIG2Page( s[:jbig2path],s[:jbig2dict],ref(ocFore.getID) )
      else
        loaded = loadCCITTPage( s[:path],ref(ocFore.getID) )
      end
      # The loaders return nil on failure. Stop before unpacking in that
      # case, so that the page resolution read above is not replaced by nil.
      break if loaded.nil? or loaded[0].nil?
      xobj,width,height,xres,yres = loaded

      color = s[:rgb].join(' ') << ' rg'
      doc_objs << xobj

      c_str << "#{color} /Im#{lastimg} Do "
      lastimg += 1
    end

    fg_image = bg_image = nil
    fg_image = loadImage( pf.fg_layer,ocFore.getID,procSet ) unless pf.fg_layer.nil?
    bg_image = loadImage( pf.bg_layer,ocBack.getID,procSet ) unless pf.bg_layer.nil?

    contents = XObj.new(Hash[
      'Filter' => '/FlateDecode'
    ])
    resobj = XObj.new(Hash.new())
    resources = XObj.new(Hash[
      'XObject' => ref(resobj.getID)
    ])

    if fg_image.nil?
      doc_objs.each_index do |i|
        resobj.addToDict( "Im#{i}", ref(doc_objs[i].getID) )
      end
    else
      # The first stencil becomes a soft mask of the foreground picture.
      # There may be no stencil at all if none could be loaded.
      mask = doc_objs[0]
      unless mask.nil?
        fg_image.addToDict('SMask', ref(mask.getID))
        mask.removeFromDict('ImageMask')
        mask.addToDict('Decode', '[1 0]')
      end
      resobj.addToDict('Im0', ref(fg_image.getID))
      doc_objs << fg_image
      c_str = '/Im0 Do '
    end

    unless bg_image.nil?
      # The background is painted first, hence it is prepended.
      c_str = "/Im#{resobj.dictLength} Do " << c_str
      resobj.addToDict( "Im#{resobj.dictLength}", ref(bg_image.getID) )
      doc_objs << bg_image
    end
    c_str = sprintf( "q %.2f 0 0 %.2f 0 0 cm %sQ",pwidth,pheight,c_str )

    doc_objs.concat( [contents, resobj, resources] )

    hocr = nil
    unless pf.hocr_path.nil?
      # File.open rather than Kernel#open: the latter would run a command
      # if the path happened to start with a pipe character.
      hocr = File.open( pf.hocr_path ) { |f| Hpricot.parse( f ) }
      procSet << '/Text'
      c_str << getPDFText( hocr,pheight,72.0/xres,72.0/yres,encodings )
    end

    contents.reinit( Hash[
      'Filter' => '/FlateDecode'
    ], Zlib::Deflate.deflate( c_str,9 ) )
    resources.addToDict( 'ProcSet', "[ #{procSet.join(' ')} ]" )
    resources.addToDict( 'Font', ref( fdict.getID ) ) unless hocr.nil?

    page = XObj.new(Hash[
      'Type'      => '/Page',
      'Parent'    => "#{pages.getID} 0 R",
      'MediaBox'  => sprintf( "[ 0 0 %.02f %.02f ]",pwidth,pheight ),
      'Contents'  => ref( contents.getID ),
      'Resources' => ref( resources.getID )
    ])
    # By default acroread uses /DeviceCMYK as a transparency blending space,
    # so adding an SMask image to a page would result in colors being shifted,
    # unless we take special care of this. For more details see
    # http://comments.gmane.org/gmane.comp.tex.pdftex/3747
    unless fg_image.nil?
      cspace = '/DeviceRGB'
      cspace = fg_image.getFromDict( 'ColorSpace' ) if fg_image.hasInDict( 'ColorSpace' )
      page.addToDict( 'Group', "<< /S /Transparency /CS #{cspace} >>" )
    end
    doc_objs << page
    doc_objs.each{ |x| @doc.addObject(x) }
    page_objs << page

    pages.addToDict( 'Count', page_objs.length )
    pages.addToDict( 'Kids', '[' << page_objs.map{|x| ref(x.getID).to_s}.join(' ') << ']' )

    # Remember the page under its label (or its ordinal number), so that
    # TOC entries can be resolved later.
    pkey = pidx + 1
    pkey = labels.getPageLabel( cur_range_id,pidx ) if has_labels
    pages_by_num[pkey] = page.getID
    pidx += 1
    if has_labels
      if cur_range_id < labels.length - 1 and labels[cur_range_id + 1][:first] == pidx
        cur_range_id += 1
      end
    end

    $stderr.puts("Processed #{pf.name}\n")
    $stderr.puts("  Added background image from #{pf.bg_layer}\n") unless bg_image.nil?
    $stderr.puts("  Added foreground image from #{pf.fg_layer}\n") unless fg_image.nil?
  end

  # The fonts can be written only now, as the encodings are filled while
  # the hOCR text of the pages is processed.
  if needs_font
    fidx = 1
    encodings.each do |enc|
      font = addFont( descr,enc,"Fnt#{fidx}" )
      fdict.addToDict( "Fnt#{fidx}",ref(font.getID) )
      fidx += 1
    end
  end

  if toc != nil and toc.length > 0
    getOutlineObjs( toc,pages_by_num,page_objs[0].getID )
    cat.addToDict('Outlines', ref(toc[0][:pdfobj].getID))
    cat.addToDict('PageMode', "/UseOutlines")
  end
end
315
+
316
# Output the created PDF file to the disk.
#
# outpath -- path of the file to write, or the literal string 'STDOUT' to
#            send the document to the standard output instead.
#
# The data is always written in binary mode, whatever the platform is: a PDF
# is not a text file, and newline translation would damage it. Any failure
# is reported on the standard error; no exception leaves this method.
def output( outpath )
  begin
    if outpath.eql?( 'STDOUT' )
      $stdout.binmode
      $stdout.write( @doc.to_s )
    else
      # The block form closes the file even if writing fails.
      File.open( outpath,'wb' ) { |out| out.write( @doc.to_s ) }
    end
  rescue
    $stderr.puts( "Error: could not write to #{outpath}" )
  end
end
332
+
333
+ private
334
+
335
# Reads document metadata from a text file.
#
# path -- file to read; nil or an empty string means "no metadata".
#
# Lines starting with '#' are skipped. Other lines are expected to look like
#   Key: "value"
# (a leading slash before the key is allowed). Only the Title, Author,
# Subject and Keywords keys are accepted; everything else is ignored.
#
# Returns a hash mapping the accepted keys to their values converted from
# UTF-8 to UTF-16BE.
def parseMeta( path )
  meta = Hash.new()
  return meta if path.nil? || path.eql?( '' )

  accepted = [ 'Title', 'Author', 'Subject', 'Keywords' ]
  entry_re = /^\/?([A-Za-z]+)[ \t]*:[ \t]+\"(.*)\"/

  File.open( path,'r' ) do |fin|
    fin.set_encoding 'UTF-8' if fin.respond_to? :set_encoding
    fin.each do |line|
      next if /^\#/.match( line )

      entry = entry_re.match( line )
      next if entry.nil? || !accepted.include?( entry[1] )

      begin
        meta[entry[1]] = Iconv.iconv( "utf-16be", "utf-8", entry[2] ).first
      rescue
        $stderr.puts("Error: metadata should be specified in utf-8")
      end
    end
  end
  meta
end
359
+
360
# Creates the PDF outline (bookmark) objects for a table of contents and
# adds them to the document.
#
# toc      -- list of TOC nodes. The first one is the root; every other node
#             is expected to provide :title (UTF-16BE), :ref, :parent and
#             :children, optionally :open, :prev and :next, and to respond
#             to getChildrenCount. The created object is stored into each
#             node under the :pdfobj key.
# page_ids -- hash mapping page labels/numbers to page object IDs.
# fp_id    -- ID of the first page. It is accepted for interface
#             compatibility only: an entry referring to an unknown page
#             gets no destination at all (and is shown in gray).
def getOutlineObjs( toc,page_ids,fp_id )
  root = toc[0]
  root[:pdfobj] = XObj.new( Hash[
    'Type'  => '/Outlines',
    'Count' => root.getChildrenCount
  ])
  @doc.addObject(root[:pdfobj])

  toc[1..-1].each do |item|
    dest = nil
    if page_ids.has_key? item[:ref]
      dest = page_ids[item[:ref]]
    else
      $stderr.puts("Malformed TOC: there is no page #{item[:ref]} in this document.")
    end

    # Escape every parenthesis and backslash byte of the title, not just the
    # first one. This is done bytewise, since a PDF reader scans a literal
    # string byte by byte: such a byte has to be protected even if it is
    # the high half of a UTF-16 unit (e.g. 0x29 in U+0429).
    item_text = item[:title].to_binary
    item_text.gsub!( /[\\()]/n ){ |ch| "\\" + ch }
    item[:pdfobj] = XObj.new(Hash[
      'Title'  => "(\xFE\xFF#{item_text.to_text})",
      'Parent' => ref(item[:parent][:pdfobj].getID)
    ])
    if dest != nil
      item[:pdfobj].addToDict('Dest', "[ #{dest} 0 R /XYZ null null null ]")
    else
      item[:pdfobj].addToDict('C', "[0.75 0.75 0.75]")
    end

    # A negative count marks an entry whose children are initially hidden.
    if item[:children].length > 0
      cnt = item.getChildrenCount
      if item[:open]
        item[:pdfobj].addToDict('Count', cnt)
      else
        item[:pdfobj].addToDict('Count', -cnt)
      end
    end

    # Link the entry with its siblings and its parent.
    if item.has_key? :prev
      item[:prev][:pdfobj].addToDict('Next', ref(item[:pdfobj].getID))
      item[:pdfobj].addToDict('Prev', ref(item[:prev][:pdfobj].getID))
    else
      item[:parent][:pdfobj].addToDict('First', ref(item[:pdfobj].getID))
    end

    unless item.has_key? :next
      item[:parent][:pdfobj].addToDict('Last', ref(item[:pdfobj].getID))
    end

    @doc.addObject(item[:pdfobj])
  end
end
414
+
415
# Returns an array containing the coordinates of the bounding box around
# an element, converted to page units.
#
# element -- an hOCR element; its 'title' attribute is searched for a
#            "bbox x0 y0 x1 y1" property.
# xscale  -- factor applied to the horizontal coordinates (x0 and x1).
# yscale  -- factor applied to the vertical coordinates (y0 and y1).
#
# The result is [x0, y0, x1, y1] as floats, or [0,0,0,0] if the element has
# no usable bbox.
def elementCoordinates( element,xscale,yscale )
  out = [0,0,0,0]

  attrs = element.attributes.to_hash
  if attrs.has_key? 'title'
    found = /bbox((\s+\d+){4})/.match( attrs['title'] )
    if found
      x0,y0,x1,y1 = found[1].strip.split(/\s+/).map{ |c| c.to_i }
      out = [ (x0*xscale).to_f,(y0*yscale).to_f,
              (x1*xscale).to_f,(y1*yscale).to_f ]
    end
  end
  return out
end
429
+
430
# Converts the text lines of an hOCR document into PDF text operators.
#
# hocr      -- parsed hOCR document (it is asked for the Content-Type meta
#              element and for the span elements of class 'ocr_line').
# pheight   -- page height, used to flip the vertical axis.
# xscale    -- horizontal factor passed to elementCoordinates.
# yscale    -- vertical factor passed to elementCoordinates.
# encodings -- list of 8-bit encodings (each one is an array of characters;
#              the index of a character is its code). The list is extended
#              in place whenever a character not seen so far is met, and
#              the n-th encoding is referred to as font /Fnt<n>.
#
# Returns the content stream fragment, from BT up to ET. Text rendering
# mode 3 is selected at its very start.
def getPDFText( hocr,pheight,xscale,yscale,encodings )
  fsize = 10
  cur_enc = encodings[0]
  ret = " BT 3 Tr /Fnt1 #{fsize} Tf "

  charset = 'utf-8'
  hocr.search("//meta[@http-equiv='Content-Type']").each do |el|
    attrs = el.attributes.to_hash
    charset = $1 if attrs.has_key? 'content' and
      /\Atext\/html;charset=([A-Za-z0-9-]+)\Z/i.match( attrs['content'] )
  end

  hocr.search("//span[@class='ocr_line']").each do |line|
    # Every run of line breaks inside the line becomes a single space
    # (not just the first one).
    txt = line.to_plain_text.strip.gsub( /[\n\r]+/,' ' )
    begin
      txt = Iconv.iconv( 'utf-8',charset,txt ).first unless charset.downcase.eql? 'utf-8'
    rescue
      txt = ''
    end
    next if txt.eql? ''
    txt.force_encoding( 'utf-8' ) if txt.respond_to? :force_encoding
    # A hyphen at the very end of a line is replaced with a soft hyphen.
    txt.sub!( /-\Z/, "\xC2\xAD" )

    # Scale the text, so that it gets as wide as its bounding box.
    bbox = elementCoordinates( line,xscale,yscale )
    ratio = ( bbox[2] - bbox[0] ) / @fdata.getLineWidth( txt,fsize )
    ret << sprintf( "%f %f %f %f %f %f Tm ",
      ratio, 0, 0, ratio, bbox[0], pheight - bbox[3] - @fdata.header['Descent'] * fsize / 1000.0)

    # Hexadecimal codes of the characters to be shown with the current font.
    txt8 = ''
    txt.each_char do |char|
      begin
        Iconv.iconv( "utf-16be","utf-8",char )
      rescue
        rawbytes = char.unpack( 'C*' )
        bs = ''
        rawbytes.each{ |b| bs << sprintf( "%02x",b ) }
        $stderr.puts( "Warning: an invalid UTF-8 sequence (#{bs}) in the hOCR data." )
        char = '?' * rawbytes.length
      end

      encoded = false
      unless cur_enc.include? char
        # First look for the character in the other existing encodings...
        encodings.each_index do |i|
          enc = encodings[i]
          next if enc == cur_enc

          if enc.include? char
            ret << "<#{txt8}> Tj "
            cur_enc = enc
            ret << "/Fnt#{i + 1} #{fsize} Tf "
            txt8 = ''
            encoded = true
            break
          end
        end

        # ...then append it to the last encoding, or start a new one if
        # the last encoding already has all of its 256 slots occupied.
        unless encoded
          last = encodings[-1]
          if last.length < 256
            last << char
          else
            last = [ ' ',char ]
            encodings << last
          end

          if cur_enc != last
            ret << "<#{txt8}> Tj "
            cur_enc = last
            ret << "/Fnt#{encodings.length} #{fsize} Tf "
            txt8 = ''
          end
        end
      end

      txt8 << sprintf( "%02X",cur_enc.index(char) )
    end

    ret << "<#{txt8}> Tj " unless txt8.eql? ''
  end

  ret << "ET "
  return ret
end
513
+
514
# Adds a Type1 font object to the document, together with its encoding and
# ToUnicode objects.
#
# descr -- the font descriptor object shared by all fonts.
# fenc  -- the encoding: an array of characters whose indices are used as
#          the character codes.
# fname -- the resource name of the font (without the leading slash).
#
# Returns the font object.
def addFont( descr,fenc,fname )
  enc_str = @fdata.getEncoding( fenc ).join( ' ' )
  enc = XObj.new( Hash[
    'Type'        => "/Encoding",
    'Differences' => "[ 0 #{enc_str} ]"
  ])
  @doc.addObject( enc )

  toUni = @fdata.getCMAP( fenc )
  @doc.addObject( toUni )

  font = XObj.new( Hash[
    'BaseFont'       => '/Times-Roman',
    'Name'           => "/#{fname}",
    'Subtype'        => '/Type1',
    'Type'           => '/Font',
    'FirstChar'      => 0,
    'LastChar'       => fenc.length - 1,
    'Widths'         => '[ ' << @fdata.getWidths(fenc).map{|w| w.to_s}.join(' ') << ' ]',
    'FontDescriptor' => ref(descr.getID),
    'ToUnicode'      => ref(toUni.getID)
  ] )
  # The encoding object has just been created above, so it always exists;
  # there is no case left where a predefined encoding would be used.
  font.addToDict( 'Encoding',ref(enc.getID) )
  @doc.addObject( font )
  return font
end
544
+
545
# Loads a bitonal stencil as a CCITT Group 4 image mask.
#
# path  -- the image file.
# ocref -- reference to the optional content group the image belongs to.
#
# Returns [ xobj,width,height,xres,yres ], or nil if the image inspector
# reports no width for the file.
def loadCCITTPage( path,ocref )
  stencil = ImageInspector.new( path )
  return nil if stencil.width.nil?

  width,height = stencil.width,stencil.height
  xres,yres = stencil.x_dpi,stencil.y_dpi
  rows_per_strip = stencil.tags[0x116][0]

  # The data can be taken as it is only from a Group 4 file where a single
  # strip covers the whole picture. Anything else goes through ImageMagick.
  ready = ( stencil.compression.eql?( :CCITTFaxDecode ) && rows_per_strip >= height )
  if !ready
    img = ImageList.new( path )
    imgdata = img.to_blob{
      self.format = 'TIFF'
      self.define( 'TIFF','rows-per-strip',height )
      self.compression = Group4Compression
    }
    stencil = ImageInspector.new( StringIO.new(imgdata) )
    img.destroy!
  end

  attrs = Hash[
    'Type'             => '/XObject',
    'Subtype'          => '/Image',
    'OC'               => ocref,
    'Width'            => width.to_s,
    'Height'           => height.to_s,
    'ImageMask'        => 'true',
    'ColorSpace'       => '/DeviceGray',
    'BitsPerComponent' => '1',
    'Filter'           => '/CCITTFaxDecode',
    'DecodeParms'      => "<< /Columns #{width} /K -1 >>"
  ]
  xobj = XObj.new( attrs,stencil.getRawData )

  [ xobj,width,height,xres,yres ]
end
582
+
583
# Loads a JBIG2 page stream (and, if it is not loaded yet, the symbol
# dictionary it depends on) as an image mask.
#
# path     -- the JBIG2 page file.
# dictpath -- the symbol dictionary file. It is added to the document only
#             if it differs from the dictionary loaded by the previous call.
# ocref    -- reference to the optional content group the image belongs to.
#
# Returns [ xobj,width,height,xres,yres ], or nil (after printing a message)
# if the files could not be read.
def loadJBIG2Page( path,dictpath,ocref )
  begin
    # Binary mode is essential here: the size and resolution fields are
    # located by their byte offsets. The block form closes the files.
    jbig2 = File.open( path,'rb' ) { |f| f.read }
    width, height, xres, yres = jbig2[11...27].unpack( 'NNNN' )
    unless @dictpath.eql? dictpath
      symd_f = File.open( dictpath,'rb' ) { |f| f.read }
      symd_o = @doc.addObject( XObj.new(Hash.new(),symd_f) )
      @dictpath = dictpath
      @dictobj = symd_o
    end
  rescue
    $stderr.puts( "Page not completed: could not access #{path}" )
    return nil
  end

  xobj = XObj.new(Hash[
    'Type'             => '/XObject',
    'Subtype'          => '/Image',
    'OC'               => ocref,
    'Width'            => width.to_s,
    'Height'           => height.to_s,
    'ImageMask'        => 'true',
    'ColorSpace'       => '/DeviceGray',
    'BitsPerComponent' => '1',
    'Filter'           => '/JBIG2Decode',
    'DecodeParms'      => "<< /JBIG2Globals #{@dictobj.getID} 0 R >>"
  ], jbig2)

  return [ xobj,width,height,xres,yres ]
end
613
+
614
# Loads a foreground or background picture as a PDF image object.
#
# impath  -- the image file.
# ocID    -- ID of the optional content group the image belongs to.
# procSet -- the ProcSet list of the page; '/ImageI' or '/ImageC' is
#            appended to it (in place) when the image requires that.
#
# Returns the image object, or nil if the image inspector reports no width
# for the file.
def loadImage( impath,ocID,procSet )
  insp = ImageInspector.new( impath )
  return nil if insp.width.nil?

  # The data of JPEG, JPEG2000 and PNG files can be embedded directly. The
  # same is true for uncompressed TIFF files (hardly ever used for a page
  # background, though). A compressed TIFF normally consists of several
  # separately compressed chunks which can't be just glued together, so it
  # is accepted only when a single strip covers the whole image. Everything
  # else is converted by ImageMagick into a zip-compressed PNG, and the raw
  # data is then taken from that PNG.
  direct = [ :JPEG, :JPEG2000, :PNG ].include?( insp.format )
  if !direct && insp.format.eql?( :TIFF )
    direct = insp.compression.eql?( :NoCompression ) ||
      ( [ :FlateDecode,:LZWDecode,:CCITTFaxDecode ].include?( insp.compression ) &&
        insp.tags[0x0116][0] >= insp.height )
  end

  if !direct
    img = ImageList.new( impath )
    imgdata = img.to_blob{
      self.format = 'PNG'
      self.quality = 95
      self.compression = ZipCompression
    }
    insp = ImageInspector.new( StringIO.new(imgdata) )
    img.destroy!
  end

  rawdata = insp.getRawData
  cspace = "/#{insp.cspace}"
  fmt = insp.format
  imgcompr = insp.compression

  if cspace.eql?( '/Indexed' ) && !insp.palette.nil?
    cpal = insp.palette
    # A palette where every entry has equal components needs just one byte
    # per entry and a gray base color space.
    rgb = false
    cpal.each do |c|
      next unless c[0] != c[1] or c[0] != c[2]
      rgb = true
      break
    end
    base = rgb ? '/DeviceRGB' : '/DeviceGray'

    cspace = "[/Indexed #{base} #{cpal.length - 1} < "
    cpal.each do |c|
      if rgb
        cspace << sprintf( "%02x %02x %02x ",c[0],c[1],c[2] )
      else
        cspace << sprintf( "%02x ",c[0] )
      end
    end
    cspace << '>]'

    procSet << '/ImageI' unless procSet.include? '/ImageI'

  elsif !cspace.eql?( '/DeviceGray' ) && !procSet.include?( '/ImageC' )
    procSet << '/ImageC'
  end

  # Number of color components per sample, needed by the predictor.
  per_comp = case cspace
             when '/DeviceRGB' then 3
             when '/DeviceCMYK' then 4
             else 1
             end

  image = XObj.new( Hash[
    'Type'        => '/XObject',
    'Subtype'     => '/Image',
    'OC'          => ref( ocID ),
    'Width'       => insp.width,
    'Height'      => insp.height,
    'Interpolate' => 'true'
  ], rawdata )

  if !fmt.eql?( :JPEG2000 )
    image.addToDict( 'BitsPerComponent',insp.depth )
    image.addToDict( 'ColorSpace',"#{cspace}" )
  end
  image.addToDict( 'Filter',"/#{imgcompr}" ) unless insp.compression.eql? :NoCompression
  if [:PNG, :TIFF].include? fmt
    predictor = fmt.eql?( :PNG ) ? 15 : 2
    image.addToDict( 'DecodeParms',
      "<< /Predictor #{predictor} /Colors #{per_comp} /BitsPerComponent #{insp.depth} /Columns #{insp.width} >>" )
  end
  image
end
694
+
695
# Formats an indirect reference to the object with the given ID
# (generation number 0), e.g. "12 0 R".
def ref(x)
  x.to_s + ' 0 R'
end
698
+ end
699
+