pdfbeads 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,139 @@
1
+ # encoding: UTF-8
2
+
3
+ ######################################################################
4
+ #
5
+ # PDFBeads -- convert scanned images to a single PDF file
6
+ # Version 1.0
7
+ #
8
+ # Unlike other PDF creation tools, this utility attempts to implement
9
+ # the approach typically used for DjVu books. Its key feature is
10
+ # separating scanned text (typically black, but indexed images with
11
+ # a small number of colors are also accepted) from halftone images
12
+ # placed into a background layer.
13
+ #
14
+ # Copyright (C) 2010 Alexey Kryukov (amkryukov@gmail.com).
15
+ # All rights reserved.
16
+ #
17
+ # This program is free software; you can redistribute it and/or modify
18
+ # it under the terms of the GNU General Public License as published by
19
+ # the Free Software Foundation; either version 2 of the License, or
20
+ # (at your option) any later version.
21
+ #
22
+ # This program is distributed in the hope that it will be useful,
23
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
24
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25
+ # GNU General Public License for more details.
26
+ #
27
+ # You should have received a copy of the GNU General Public License
28
+ # along with this program; if not, write to the Free Software
29
+ # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
30
+ #
31
+ #######################################################################
32
+
33
+ # Parse a specification string passed to pdfbeads via its -L (--labels)
34
+ # option and convert it into a sequence of ranges which can be used for
35
+ # building a PageLabels dictionary embeddable into the PDF file.
36
+ # The specification format is based on the PDF format description,
37
+ # section 12.4.2. and looks as follows:
38
+ #
39
+ # * ranges are separated with a semicolon;
40
+ #
41
+ # * each range consists of the physical number of its first page,
42
+ # followed by a colon and a number format description;
43
+ #
44
+ # * the number format consists of an optional prefix, followed
45
+ # by a percent sign, an optional number indicating the value of the
46
+ # numeric portion for the first page label in the range, and a single
47
+ # Latin letter indicating the desired numbering style;
48
+ #
49
+ # * the following numbering styles are supported:
50
+ # [D] -- Decimal arabic numerals;
51
+ # [R] -- Uppercase roman numerals;
52
+ # [r] -- Lowercase roman numerals;
53
+ # [A] -- Uppercase Latin letters (A to Z for the first 26 pages,
54
+ # AA to ZZ for the next 26, and so on);
55
+ # [a] -- Lowercase letters (a to z for the first 26 pages,
56
+ # aa to zz for the next 26, and so on).
57
+ #
58
+ # For example if a book starts from two unnumbered title pages, followed
59
+ # by 16 pages numbered with Roman digits, and then goes the Arabic numeration,
60
+ # which however starts from 17, then the following label specification
61
+ # string would be appropriate:
62
+ # +"0:Title %D;2:%R;18:%16D"+
63
+
64
+ class PDFBeads::PDFBuilder::PDFLabels < Array
65
# Parse a label specification string and fill this array with one hash
# per page range.
#
# arg:: the specification, e. g. "0:Title %D;2:%R;18:%16D" (ranges are
#       separated with semicolons; nil is treated as an empty string).
#
# Each resulting hash always has the +:first+ key (physical number of the
# first page in the range) and may additionally contain +:prefix+ (String),
# +:start+ (Integer value of the first numeric label) and +:style+ (one of
# "D", "R", "r", "A", "a"), depending on what the description specifies.
# Descriptions which don't begin with a page number are silently skipped.
def initialize( arg )
  arg.to_s.split( ';' ).each do |descr|
    first, format = descr.split( ':', 2 )
    # A range is only meaningful if it starts with a physical page number
    # (String#to_i would silently turn anything else into page 0).
    next unless /\A\s*\d+/.match( first.to_s )

    rng = { :first => first.to_i }
    # Only the percent sign terminates the prefix: a dot is a legitimate
    # prefix character (e. g. "Fig. %D") and must not cut the format short.
    if format and ( m = /\A([^%]*)%?(\d*)([DRrAa]?)/.match( format ) )
      rng[:prefix] = m[1] unless m[1].empty?
      rng[:start ] = m[2].to_i unless m[2].empty?
      rng[:style ] = m[3] unless m[3].empty?
    end
    push( rng )
  end
end
81
+
82
+ # Convert a physical page number into the label we would like to be displayed
83
+ # for this page in the PDF viewer.
84
# Build the label for a single page.
#
# rng_id::  index of the label range (an element of this array) the page
#           belongs to;
# page_id:: physical number of the page.
#
# Returns the prefix of the range (if it has one) followed by the page
# number formatted according to the style of the range. The numeric part
# is left empty when the range has no style; numbering starts from 1
# unless the range specifies its own start value.
def getPageLabel( rng_id,page_id )
  range = self[rng_id]

  first_value = range.has_key?( :start ) ? range[:start] : 1
  number = page_id - range[:first] + first_value

  label = range.has_key?( :prefix ) ? range[:prefix] : ''
  formatted = range.has_key?( :style ) ? pnum2string( number,range[:style] ) : ''

  "#{label}#{formatted}"
end
99
+
100
+ private
101
+
102
# Convert a positive integer into uppercase Roman numerals
# (e. g. 14 => "XIV"). Zero and negative numbers have no Roman
# representation, so an empty string is returned for them.
def int2roman( num )
  # Listed from the largest value to the smallest one: the conversion is
  # greedy, so trying the small values first would spell the whole number
  # with a sequence of "I".
  numerals = [
    [1000, "M"], [900, "CM"], [500, "D"], [400, "CD"],
    [100,  "C"], [90,  "XC"], [50,  "L"], [40,  "XL"],
    [10,   "X"], [9,   "IX"], [5,   "V"], [4,   "IV"], [1, "I"]
  ]
  res = String.new

  numerals.each do |val, sym|
    while num >= val
      res << sym
      num -= val
    end
  end

  return res
end
119
+
120
# Convert a positive integer into an uppercase alphabetic page label:
# A to Z for 1..26, AA to ZZ for 27..52, AAA to ZZZ for 53..78 and so on.
# An empty string is returned for zero and negative numbers. The result
# is uppercase; callers wanting the lowercase style downcase it.
def int2ralph( num )
  return '' if num < 1

  # Labels are 1-based, so shift the number before splitting it into the
  # letter index (0..25) and the number of extra repetitions.
  quot, mod = ( num - 1 ).divmod( 26 )
  return ( mod + 65 ).chr * ( quot + 1 )
end
124
+
125
# Format a page number according to the given numbering style.
#
# pnum::  the numeric value of the label;
# style:: "R"/"r" for Roman numerals, "A"/"a" for letters (the lowercase
#         variants are obtained by downcasing the result of the matching
#         converter); any other value gives plain decimal digits.
def pnum2string( pnum,style )
  return int2roman( pnum )          if 'R' == style
  return int2roman( pnum ).downcase if 'r' == style
  return int2ralph( pnum )          if 'A' == style
  return int2ralph( pnum ).downcase if 'a' == style

  pnum.to_s
end
139
+ end
@@ -0,0 +1,466 @@
1
+ # encoding: UTF-8
2
+
3
+ ######################################################################
4
+ #
5
+ # PDFBeads -- convert scanned images to a single PDF file
6
+ # Version 1.0
7
+ #
8
+ # Unlike other PDF creation tools, this utility attempts to implement
9
+ # the approach typically used for DjVu books. Its key feature is
10
+ # separating scanned text (typically black, but indexed images with
11
+ # a small number of colors are also accepted) from halftone images
12
+ # placed into a background layer.
13
+ #
14
+ # Copyright (C) 2010 Alexey Kryukov (amkryukov@gmail.com).
15
+ # All rights reserved.
16
+ #
17
+ # This program is free software; you can redistribute it and/or modify
18
+ # it under the terms of the GNU General Public License as published by
19
+ # the Free Software Foundation; either version 2 of the License, or
20
+ # (at your option) any later version.
21
+ #
22
+ # This program is distributed in the hope that it will be useful,
23
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
24
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25
+ # GNU General Public License for more details.
26
+ #
27
+ # You should have received a copy of the GNU General Public License
28
+ # along with this program; if not, write to the Free Software
29
+ # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
30
+ #
31
+ #######################################################################
32
+
33
+ # Represents a set of page images accompanied by auxiliary files
34
+ # needed to build a PDF document.
35
+ class PDFBeads::PageDataProvider < Array
36
+
37
+ # Collects the data needed for building an individual page
38
+ # of a PDF document and gives access to those data.
39
+ class PageData
40
+ attr_reader :name, :basename, :s_type, :stencils, :hocr_path
41
+ attr_accessor :width, :height, :x_res, :y_res, :fg_layer, :bg_layer
42
+
43
# Remember the page image and the settings shared by all pages.
#
# path::     path to the page image (exposed as +name+);
# basename:: the image file name without its extension;
# args::     the hash of processing options;
# exts::     extension patterns accepted for auxiliary images;
# pref::     extension patterns of the preferred background format.
#
# The stencil type starts as 'b', the stencil list is empty and neither
# a background nor a foreground layer is assigned yet.
def initialize( path,basename,args,exts,pref )
  @name, @basename = path, basename
  @pageargs, @exts, @pref = args, exts, pref

  @s_type   = 'b'
  @stencils = []
  @fg_layer = nil
  @bg_layer = nil
end
53
+
54
# Inspect the page image, record its dimensions and resolution and fill
# the list of stencils.
#
# Returns the number of stencils produced: 0 if the image could not be
# inspected (no width reported), 1 for a bitonal image (which is used as
# is), otherwise whatever processIndexed or processMixed returns.
def fillStencilArray()
  update_forced = @pageargs[:force_update]
  forced_dpi = @pageargs[:st_resolution]

  # Description of the default (black) stencil based on the image itself.
  stencil = Hash[
    :path => @name,
    :rgb  => [0.0, 0.0, 0.0],
  ]

  info = ImageInspector.new( @name )
  return 0 if info.width.nil?

  @width = info.width
  @height = info.height
  # A positive :st_resolution option overrides the resolution reported
  # for the image.
  if forced_dpi > 0
    @x_res = @y_res = forced_dpi
  else
    @x_res = info.x_dpi
    @y_res = info.y_dpi
  end

  if info.depth == 1
    @stencils << stencil
    $stderr.puts( "Prepared data for processing #{@name}\n" )
    return 1
  end

  count = 0
  image = ImageList.new( @name )
  # ImageMagick incorrectly identifies indexed PNG images as DirectClass.
  # It also assigns a strange color value to fully opaque areas. So
  # we have to use an independent approach to recognize indexed images.
  unless info.palette.nil?
    image.class_type = PseudoClass
    count = processIndexed( image,@pageargs[:maxcolors],update_forced )
  end
  # Fall back to the generic processing if the image is not indexed or
  # no indexed stencils have been produced.
  count = processMixed( image,update_forced,stencil ) if count == 0
  image.destroy!

  # Make sure there are no more RMagick objects
  GC.start
  $stderr.puts( "Prepared data for processing #{@name}\n" )
  return count
end
95
+
96
# Look in the current directory for auxiliary files accompanying the page
# image and remember those which are found:
#
# * <basename>.bg.* or <basename>.sep.* becomes the background layer (an
#   image in the preferred format takes priority over other formats);
# * <basename>.color.* is split into layers with separateColor if there
#   is no background image, or if updating is forced and the stencil type
#   is not 'c';
# * <basename>.fg.* becomes the foreground layer, provided the page has
#   exactly one stencil;
# * <basename>.hocr / .htm / .html is remembered as the hOCR file if the
#   $has_hpricot flag is set.
#
# Layers which are already assigned are left untouched.
def addSupplementaryFiles()
  force = @pageargs[:force_update]
  # The extension lists contain regexp fragments (e. g. 'TIFF?') and thus
  # are interpolated verbatim...
  exts_pattern = @exts.join( '|' )
  pref_pattern = @pref.join( '|' )
  # ...while the base name is a literal string: it must be escaped, or
  # names containing characters like '(' or '+' would never be matched.
  base = Regexp.escape( @basename )

  # The directory is re-read on every lookup, since separateColor may
  # create new files in between.
  find = lambda do |pattern|
    Dir.entries( '.' ).detect { |f| pattern.match( f ) }
  end

  if @bg_layer.nil?
    bgpath = find.call( /\A#{base}\.(bg|sep)\.(#{pref_pattern})\z/i )
    bgpath = find.call( /\A#{base}\.(bg|sep)\.(#{exts_pattern})\z/i ) if bgpath.nil?
    @bg_layer = bgpath unless bgpath.nil?

    # If updating auxiliary files is requested and the base image is
    # either monochrome or indexed with just a few colors (i. e. doesn't
    # contain any elements which should be encoded to the background layer),
    # then the *.color.* image (if present) takes priority over any existing
    # *.bg.* and *.fg.* images. So we should regenerate them.
    if bgpath.nil? or ( force and not @s_type.eql? 'c' )
      colorpath = find.call( /\A#{base}\.color\.(#{exts_pattern})\z/i )
      separateColor( colorpath ) unless colorpath.nil?
    end
  end

  if @fg_layer.nil? and @stencils.length == 1
    fgpath = find.call( /\A#{base}\.(fg)\.(#{exts_pattern})\z/i )
    @fg_layer = fgpath unless fgpath.nil?
  end

  if $has_hpricot
    @hocr_path = find.call( /\A#{base}\.(HOCR|HTML?)/i )
  end
end
139
+
140
# Make sure the resolution of an image is expressed in pixels per inch.
#
# If the units of the image are pixels per centimeter, they are switched
# to pixels per inch and the resolution values read beforehand are
# multiplied by 2.54 and rounded. Otherwise the image is left alone.
#
# Returns a two-element array with the horizontal and the vertical
# resolution.
def self.fixResolution( img )
  xres = img.x_resolution
  yres = img.y_resolution
  return [ xres, yres ] unless img.units == PixelsPerCentimeterResolution

  img.units = PixelsPerInchResolution
  [ ( xres * 2.54 ).round, ( yres * 2.54 ).round ]
end
149
+
150
+ private
151
+
152
# Save an image to a file using the compression settings appropriate for
# the requested format.
#
# img::  the image to save (anything with an RMagick-style +write+ method
#        taking a path and a settings block);
# path:: the name of the output file;
# fmt::  'JP2', 'JPG' or any other format name; formats other than the
#        first two are written with Zip compression.
#
# Returns true on success. Any error raised while writing is reported to
# the standard error stream and turned into a false return value.
def writeImage( img,path,fmt )
  img.write( path ) do
    # The block is expected to run in the context of the settings object
    # of the image, hence the explicit self.
    if 'JP2' == fmt
      self.define( 'JP2','mode','real' )
      self.define( 'JP2','numrlvls',4 )
      self.define( 'JP2','rate',0.015625 )
    elsif 'JPG' == fmt
      self.quality = 50
    else
      self.compression = ZipCompression
      self.quality = 95
    end
    self.format = fmt
  end
  true
rescue
  $stderr.puts( "Error: could not write to #{path}" )
  false
end
174
+
175
# Split an indexed image into a set of bitonal stencils, one per color.
#
# img::       the page image (its colormap is inspected);
# maxcolors:: the largest number of colors for which the image is still
#             treated as indexed;
# force::     regenerate the stencil files even if they already exist.
#
# For each color of the colormap except the excluded one (fully
# transparent black for images with an alpha channel, white otherwise),
# a Group 4 compressed TIFF named <basename>.<color>.tiff is written
# (unless it is already present and updating is not forced) and a stencil
# description with the path and the RGB components scaled by QuantumRange
# is appended to the stencil list.
#
# Returns the number of stencils added; 0 (with nothing changed) if the
# image has more than +maxcolors+ colors.
def processIndexed( img,maxcolors,force )
  ret = 0
  ncolors = img.number_colors
  return ret if ncolors > maxcolors

  @s_type = 'i'
  exc = ( img.alpha? ) ? '#00000000' : 'white'
  ( 0...ncolors ).each do |i|
    color = img.colormap( i )
    px = Pixel.from_color( color )
    next if color.eql? exc

    cpath = "#{@basename}.#{color}.tiff"
    # File.exist? rather than its alias File.exists?, which is no longer
    # available in current Ruby versions.
    if force or not File.exist?( cpath )
      bitonal = img.copy
      # Caution: replacing colors in the colormap currently only works
      # if we save the result into a bilevel TIFF file. Otherwise the
      # changes are ignored or produce a strange effect. We still use
      # this method because it allows to reduce the number of memory
      # allocations.
      ( 0...ncolors ).each do |j|
        crepl = ( j == i ) ? 'black' : 'white'
        bitonal.colormap( j,crepl )
      end
      bitonal.compress_colormap!
      bitonal.write( cpath ) do
        self.format = 'TIFF'
        self.define( 'TIFF','rows-per-strip',img.rows )
        self.compression = Group4Compression
      end
      bitonal.destroy!
    end

    @stencils << Hash[
      :path => cpath,
      :rgb  => [px.red.to_f/QuantumRange, px.green.to_f/QuantumRange, px.blue.to_f/QuantumRange]
    ]
    ret += 1
  end

  return ret
end
216
+
217
# Split an image into a bitonal stencil and a background image.
#
# img::   the page image;
# force:: regenerate the output files even if they already exist;
# map::   the stencil description to complete and register.
#
# Unless already present (or if updating is forced), two files are
# produced: <basename>.black.tiff (the thresholded image, Group 4
# compressed) and <basename>.bg.<format> (the image with black replaced
# by white, optionally converted to grayscale and resampled to the
# :bg_resolution option). The stencil is then registered, the stencil
# type is set to 'c' and the background layer is assigned.
#
# Always returns 1 (the number of stencils added).
def processMixed( img,force,map )
  binpath = "#{@basename}.black.tiff"
  # File.exist? rather than its alias File.exists?, which is no longer
  # available in current Ruby versions.
  if force or not File.exist?( binpath )
    im_copy = img.copy; bitonal = im_copy.threshold(1); im_copy.destroy!
    bitonal.write( binpath ){
      self.format = 'TIFF'
      self.define( 'TIFF','rows-per-strip',img.rows )
      self.compression = Group4Compression
    }
    bitonal.destroy!
  end

  bgf = @pageargs[:bg_format]
  bgpath = "#{@basename}.bg.#{bgf.downcase}"

  if force or not File.exist?( bgpath )
    op = img.opaque( 'black','white' ); img.destroy!; img = op
    if @pageargs[:force_grayscale]
      img.image_type = GrayscaleType
    end
    PageData.fixResolution( img )
    resampled = img.resample(@pageargs[:bg_resolution]); img.destroy!; img = resampled

    # A hack for some Windows versions of RMagick, which throw an error the
    # first time when Magick.formats is accessed. The counter lives outside
    # of the begin block: otherwise each retry would reset it and a
    # persistent error would be retried forever.
    retries = 2
    begin
      mfmts = Magick.formats
    rescue
      retry if ( retries -= 1 ) > 0
      # The list of formats is unavailable: assume nothing optional is
      # supported rather than fail on a nil value below.
      mfmts = {}
    end
    if bgf.eql? 'JP2' and not mfmts.has_key? 'JP2'
      $stderr.puts( "This version of ImageMagick doesn't support JPEG2000 compression." )
      $stderr.puts( "\tI'll use JPEG compression instead." )
      # NOTE(review): bgpath keeps the extension of the requested format
      # even though the data is written as JPEG -- confirm this is intended.
      bgf = 'JPG'
    end

    writeImage( img,bgpath,bgf )
  end

  map[:path] = binpath
  @stencils << map
  @s_type = 'c'
  @bg_layer = bgpath
  return 1
end
263
+
264
# Produce the background (and, when possible, the foreground) layer from
# a color version of the page.
#
# colorpath:: path to the color image; the page image itself (@name) is
#             loaded as well and serves as the mask.
#
# The background is written to <basename>.bg.<format> with the areas
# covered by the mask blurred over, and assigned to the background layer
# if it has been written successfully. Unless the stencil type is 'i',
# the colors of the masked areas are then diffused into a low resolution
# image written to <basename>.fg.<format>, which becomes the foreground
# layer.
#
# Returns nil. If either image cannot be read, an error message is
# printed and nothing is produced.
def separateColor( colorpath )
  fmt = @pageargs[:bg_format]
  dpi = @pageargs[:bg_resolution]

  begin
    img = ImageList.new( colorpath )
  rescue ImageMagickError
    $stderr.puts( "Error reading image file #{colorpath}" )
    return nil
  end

  begin
    mask = ImageList.new( @name )
  rescue ImageMagickError
    $stderr.puts( "Error reading image file #{@name}" )
    # The color image has already been loaded: release it before leaving.
    img.destroy!
    return nil
  end

  imw = img.columns
  imh = img.rows

  if @s_type.eql? 'i'
    # Turn every color of the indexed mask, except the excluded one,
    # into black.
    mask.class_type = PseudoClass
    exc = ( mask.alpha? ) ? '#00000000' : 'white'
    ( 0...mask.number_colors ).each do |i|
      color = mask.colormap( i )
      unless color.eql? exc
        op = mask.opaque( color,'black' )
        mask.destroy!
        mask = op
      end
    end

    if mask.alpha?
      op = mask.opaque( exc,'white' )
      mask.destroy!
      mask = op
      mask.alpha( DeactivateAlphaChannel )
    end
    mask.compress_colormap!
  end

  PageData.fixResolution( img )
  mask.resize!( imw,imh ) if mask.columns != imw or mask.rows != imh

  no_fg = img.composite( mask,CenterGravity,CopyOpacityCompositeOp )
  bg = no_fg.blur_channel( 0,6,AllChannels )
  bg.alpha( DeactivateAlphaChannel )

  bg.composite!( no_fg,CenterGravity,OverCompositeOp )
  if ( bg.x_resolution != dpi or bg.y_resolution != dpi )
    resampled = bg.resample( dpi ); bg.destroy!; bg = resampled
  end

  bgpath = "#{@basename}.bg.#{fmt.downcase}"
  @bg_layer = bgpath if writeImage( bg,bgpath,fmt )

  bg.destroy!
  no_fg.destroy!

  unless @bg_layer.nil? or @s_type.eql? 'i'
    ksam = mask.negate
    mask.destroy!

    no_bg = img.composite( ksam,CenterGravity,CopyOpacityCompositeOp )
    fg = no_bg.clone

    # Resize the image to a tiny size and then back to the original size
    # to achieve the desired color diffusion. The idea is inspired by
    # Anthony Thyssen's http://www.imagemagick.org/Usage/scripts/hole_fill_shepards
    # script, which is intended just for this purpose (i. e. removing undesired
    # areas from the image). However our approach is a bit cruder (but still
    # effective).
    # The tiny size is kept at one pixel at least: for images smaller
    # than 100 pixels the integer division would give a zero dimension.
    tiny_w = [ imw / 100, 1 ].max
    tiny_h = [ imh / 100, 1 ].max
    fg.resize!( tiny_w,tiny_h,GaussianFilter )
    fg.resize!( imw,imh,GaussianFilter )
    fg.composite!( no_bg,CenterGravity,OverCompositeOp )
    downs = fg.resample( 100 ); fg.destroy!; fg = downs
    fg.alpha( DeactivateAlphaChannel )

    fgpath = "#{@basename}.fg.#{fmt.downcase}"
    @fg_layer = fgpath if writeImage( fg,fgpath,fmt )

    fg.destroy!
    no_bg.destroy!
    ksam.destroy!
  else
    mask.destroy!
  end
  img.destroy!
  # Make sure there are no more RMagick objects still residing in memory
  GC.start
end
356
+ end
357
+
358
+ # Takes a list of file names and a hash containing a set of options.
359
# Build the list of pages.
#
# files:: names of the page images; only names consisting of a base name
#         without dots followed by a TIFF or PNG extension are accepted,
#         anything else is ignored;
# args::  the hash of processing options. The :bg_format option ('JP2',
#         'JPG' or anything else for lossless formats) determines in
#         which order the extensions of auxiliary images are tried and
#         which of them are preferred.
#
# A page is added only if at least one stencil could be prepared for it.
def initialize( files,args )
  @pageargs = args

  ext_lossless = [ 'PNG','TIFF?' ]
  ext_jpeg     = [ 'JPE?G' ]
  ext_jpeg2000 = [ 'JP2','JPX' ]

  # @exts is a list of extension groups, ordered by priority.
  case @pageargs[:bg_format]
  when 'JP2'
    @exts = [ ext_jpeg2000, ext_jpeg, ext_lossless ]
    @pref = Array.new( ext_jpeg2000 )
  when 'JPG'
    @exts = [ ext_jpeg, ext_jpeg2000, ext_lossless ]
    @pref = Array.new( ext_jpeg )
  else
    @exts = [ ext_lossless, ext_jpeg2000, ext_jpeg ]
    @pref = Array.new( ext_lossless )
  end

  # A hack for some Windows versions of RMagick, which throw an error the
  # first time when Magick.formats is accessed. The counter lives outside
  # of the begin block: otherwise each retry would reset it and a
  # persistent error would be retried forever.
  retries = 2
  begin
    mfmts = Magick.formats
  rescue
    retry if ( retries -= 1 ) > 0
    # The list of formats is unavailable: assume nothing optional is
    # supported rather than fail on a nil value below.
    mfmts = {}
  end
  unless mfmts.has_key? 'JP2'
    # The elements of @exts are whole groups, so the JPEG2000 group has to
    # be removed as a unit (comparing it with individual extensions would
    # never match anything).
    @exts.delete( ext_jpeg2000 )
    @pref = Array.new( ext_jpeg ) if @pref.include? 'JP2'
  end

  files.each do |fname|
    m = /\A([^.]*)\.(TIFF?|PNG)\Z/i.match( fname )
    next if m.nil?

    page = PageData.new( fname,m[1],args,@exts,@pref )
    next unless page.fillStencilArray() > 0

    page.addSupplementaryFiles()
    push( page )
  end
end
404
+
405
+ # A wrapper for the jbig2 encoder. The jbig2 utility is called as many
406
+ # times as needed to encode all pages with the given pages-per-dict value.
407
# Encode the stencils of all pages with the external jbig2 utility.
#
# The pages are processed in batches of :pages_per_dict pages sharing one
# symbol dictionary. Each stencil description gets the :jbig2path and
# :jbig2dict keys with the names of the encoded page and of the
# dictionary of its batch. A batch is only encoded if updating is forced
# or if some of those files are missing.
#
# Returns false if the encoder is not found in the PATH, cannot be
# started or fails; true otherwise.
def jbig2Encode()
  per_dict = @pageargs[:pages_per_dict]
  force = @pageargs[:force_update]
  src_ext = /\.(TIFF?|PNG)\Z/i

  # The separator of PATH entries is platform dependent.
  has_jbig2 = ENV['PATH'].to_s.split( File::PATH_SEPARATOR ).any? do |dir|
    File.exist?( File.join( dir,'jbig2' ) )
  end

  unless has_jbig2
    $stderr.puts("JBIG2 compression has been requested, but the encoder is not available.")
    $stderr.puts( "  I'll use CCITT Group 4 fax compression instead." )
    return false
  end

  pidx = 0
  needs_update = force
  toConvert = Array.new()
  each_index do |i|
    page = fetch(i)
    pidx += 1
    page.stencils.each do |s|
      toConvert << s[:path]
      s[:jbig2path] = s[:path].sub( src_ext,'.jbig2' )
      s[:jbig2dict] = toConvert[0].sub( src_ext,'.sym' )
      # Tested for truthiness, so that an unset (nil) :force_update option
      # is handled just like false.
      unless needs_update
        needs_update = true unless File.exist?( s[:jbig2path] ) and File.exist?( s[:jbig2dict] )
      end
    end

    if pidx == per_dict or i == length - 1
      # The jbig2 encoder processes a bunch of files at once, producing
      # pages which depend from a shared dictionary. Thus we can skip this
      # stage only if both the dictionary and each of the individual pages
      # are already found on the disk
      if needs_update and not toConvert.empty?
        begin
          # The arguments are passed as an array, bypassing the shell:
          # file names with spaces or shell metacharacters are safe then.
          IO.popen( [ 'jbig2','-s','-p' ] + toConvert ) do |f|
            # Relay everything the encoder prints, not just the first line.
            f.each_line { |line| $stderr.puts line }
          end
        rescue SystemCallError => e
          $stderr.puts( "Error: could not run the jbig2 encoder (#{e.message})" )
          return false
        end
        # success? also covers an encoder terminated by a signal, for
        # which there is no exit status to compare.
        return false unless $? and $?.success?

        toConvert.each_index do |j|
          oname = sprintf( "output.%04d",j )
          File.rename( oname,toConvert[j].sub( src_ext,'.jbig2' ) ) if File.exist? oname
        end
        File.rename( 'output.sym',toConvert[0].sub( src_ext,'.sym' ) ) if File.exist? 'output.sym'
      end

      toConvert.clear
      needs_update = force
      pidx = 0
    end
  end
  return true
end
466
+ end