pdfbeads 1.0.7 → 1.1.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -8,7 +8,7 @@
8
8
  # Unlike other PDF creation tools, this utility attempts to implement
9
9
  # the approach typically used for DjVu books. Its key feature is
10
10
  # separating scanned text (typically black, but indexed images with
11
- # a small number of colors are also accepted) from halftone images
11
+ # a small number of colors are also accepted) from halftone images
12
12
  # placed into a background layer.
13
13
  #
14
14
  # Copyright (C) 2010 Alexey Kryukov (amkryukov@gmail.com).
@@ -30,14 +30,14 @@
30
30
  #
31
31
  #######################################################################
32
32
 
33
- # Represents a set of page images accompanies with auxiliary files
33
+ # Represents a set of page images accompanied with auxiliary files
34
34
  # needed to build a PDF document.
35
35
  class PDFBeads::PageDataProvider < Array
36
36
 
37
37
  # Collects the data needed for building an individual page
38
38
  # of a PDF document and gives access to those data.
39
39
  class PageData
40
- attr_reader :name, :basename, :s_type, :stencils, :hocr_path
40
+ attr_reader :name, :basename, :s_type, :stencils, :hocr_path, :fg_created, :bg_created
41
41
  attr_accessor :width, :height, :x_res, :y_res, :fg_layer, :bg_layer
42
42
 
43
43
  def initialize( path,basename,args,exts,pref )
@@ -49,6 +49,7 @@ class PDFBeads::PageDataProvider < Array
49
49
  @exts = exts
50
50
  @pref = pref
51
51
  @bg_layer = @fg_layer = nil
52
+ @bg_created = @fg_created = false
52
53
  end
53
54
 
54
55
  def fillStencilArray()
@@ -60,6 +61,7 @@ class PDFBeads::PageDataProvider < Array
60
61
  map = Hash[
61
62
  :path => @name,
62
63
  :rgb => [0.0, 0.0, 0.0],
64
+ :created => false
63
65
  ]
64
66
 
65
67
  insp = ImageInspector.new( @name )
@@ -72,7 +74,7 @@ class PDFBeads::PageDataProvider < Array
72
74
  @x_res = @y_res = fres
73
75
  end
74
76
 
75
- if insp.depth == 1
77
+ if insp.depth == 1 and insp.trans.nil?
76
78
  @stencils << map
77
79
  ret = 1
78
80
 
@@ -94,7 +96,7 @@ class PDFBeads::PageDataProvider < Array
94
96
 
95
97
  $stderr.puts( "Prepared data for processing #{@name}\n" )
96
98
  if insp.nextImage
97
- $stderr.puts( "Warning: #{@name} contains multiple images, but only the first one")
99
+ $stderr.puts( "Warning: #{@name} contains multiple images, but only the first one")
98
100
  $stderr.puts( "\tis going to be used\n" )
99
101
  end
100
102
  ret
@@ -115,8 +117,8 @@ class PDFBeads::PageDataProvider < Array
115
117
  @bg_layer = bgpath unless bgpath.nil?
116
118
 
117
119
  # If updating auxiliary files is requested and the base image is
118
- # either monochrome or indexed with just a few colors (i. e. doesn't
119
- # contain any elements which should be encoded to the background layer),
120
+ # either bitonal or indexed with just a few colors (i. e. doesn't
121
+ # contain any elements which should be placed to the background layer),
120
122
  # then the *.color.* image (if present) takes priority over any existing
121
123
  # *.bg.* and *.fg.* images. So we should regenerate them.
122
124
  if bgpath.nil? or ( force and not @s_type.eql? 'c' )
@@ -137,7 +139,7 @@ class PDFBeads::PageDataProvider < Array
137
139
  @fg_layer = fgpath unless fgpath.nil?
138
140
  end
139
141
 
140
- if $has_hpricot
142
+ if $has_nokogiri and not @pageargs[:pages_per_dict].nil?
141
143
  @hocr_path = Dir.entries('.').detect do |f|
142
144
  /\A#{@basename}.(HOCR|HTML?)/i.match(f)
143
145
  end
@@ -158,19 +160,19 @@ class PDFBeads::PageDataProvider < Array
158
160
 
159
161
  def writeImage( img,path,fmt )
160
162
  begin
161
- img.write( path ) do
163
+ img.write( path ) do |curimg|
162
164
  case fmt
163
165
  when 'JP2'
164
- self.define( 'JP2','mode','real' )
165
- self.define( 'JP2','numrlvls',4 )
166
- self.define( 'JP2','rate',0.015625 )
166
+ curimg.define( 'JP2','mode','real' )
167
+ curimg.define( 'JP2','numrlvls',4 )
168
+ curimg.define( 'JP2','rate',0.015625 )
167
169
  when 'JPG'
168
- self.quality = 50
170
+ curimg.quality = 50
169
171
  else
170
- self.compression = ZipCompression
171
- self.quality = 95
172
+ curimg.compression = ZipCompression
173
+ curimg.quality = 95
172
174
  end
173
- self.format = fmt
175
+ curimg.format = fmt
174
176
  end
175
177
  return true
176
178
  rescue
@@ -190,6 +192,7 @@ class PDFBeads::PageDataProvider < Array
190
192
  px = Pixel.from_color( color )
191
193
  unless color.eql? exc
192
194
  cpath = "#{@basename}.#{color}.tiff"
195
+ created = false
193
196
  if not File.exists? cpath or force
194
197
  bitonal = img.copy
195
198
  # Caution: replacing colors in the colormap currently only works
@@ -202,16 +205,18 @@ class PDFBeads::PageDataProvider < Array
202
205
  bitonal.colormap( j,crepl )
203
206
  end
204
207
  bitonal.compress_colormap!
205
- bitonal.write( cpath ) do
206
- self.format = 'TIFF'
207
- self.define( 'TIFF','rows-per-strip',img.rows )
208
- self.compression = Group4Compression
208
+ bitonal.write( cpath ) do |curimg|
209
+ curimg.format = 'TIFF'
210
+ curimg.define( 'TIFF','rows-per-strip',img.rows )
211
+ curimg.compression = Group4Compression
209
212
  end
210
213
  bitonal.destroy!
214
+ created = true
211
215
  end
212
216
  cmap = Hash[
213
217
  :path => cpath,
214
- :rgb => [px.red.to_f/QuantumRange, px.green.to_f/QuantumRange, px.blue.to_f/QuantumRange]
218
+ :rgb => [px.red.to_f/QuantumRange, px.green.to_f/QuantumRange, px.blue.to_f/QuantumRange],
219
+ :created => created
215
220
  ]
216
221
  @stencils << cmap
217
222
  ret += 1
@@ -225,12 +230,13 @@ class PDFBeads::PageDataProvider < Array
225
230
  binpath = "#{@basename}.black.tiff"
226
231
  if not File.exists? binpath or force
227
232
  im_copy = img.copy; bitonal = im_copy.threshold(QuantumRange/255*treshold); im_copy.destroy!
228
- bitonal.write( binpath ){
229
- self.format = 'TIFF'
230
- self.define( 'TIFF','rows-per-strip',img.rows )
231
- self.compression = Group4Compression
233
+ bitonal.write( binpath ) { |curimg|
234
+ curimg.format = 'TIFF'
235
+ curimg.define( 'TIFF','rows-per-strip',img.rows )
236
+ curimg.compression = Group4Compression
232
237
  }
233
238
  bitonal.destroy!
239
+ map[:created] = true
234
240
  end
235
241
 
236
242
  bgf = @pageargs[:bg_format]
@@ -263,6 +269,7 @@ class PDFBeads::PageDataProvider < Array
263
269
  end
264
270
 
265
271
  writeImage( img,bgpath,bgf )
272
+ @bg_created = true
266
273
  end
267
274
 
268
275
  map[:path] = binpath
@@ -317,7 +324,7 @@ class PDFBeads::PageDataProvider < Array
317
324
  PageData.fixResolution( img )
318
325
  mask.resize!( imw,imh ) if mask.columns != imw or mask.rows != imh
319
326
 
320
- no_fg = img.composite( mask,CenterGravity,CopyOpacityCompositeOp )
327
+ no_fg = img.composite( mask,CenterGravity,CopyAlphaCompositeOp )
321
328
  bg = no_fg.blur_channel( 0,6,AllChannels )
322
329
  bg.alpha( DeactivateAlphaChannel )
323
330
 
@@ -327,7 +334,10 @@ class PDFBeads::PageDataProvider < Array
327
334
  end
328
335
 
329
336
  bgpath = "#{@basename}.bg." << fmt.downcase
330
- @bg_layer = bgpath if writeImage( bg,bgpath,fmt )
337
+ if writeImage( bg,bgpath,fmt )
338
+ @bg_layer = bgpath
339
+ @bg_created = true
340
+ end
331
341
 
332
342
  bg.destroy!
333
343
  no_fg.destroy!
@@ -336,14 +346,14 @@ class PDFBeads::PageDataProvider < Array
336
346
  ksam = mask.negate
337
347
  mask.destroy!
338
348
 
339
- no_bg = img.composite( ksam,CenterGravity,CopyOpacityCompositeOp )
349
+ no_bg = img.composite( ksam,CenterGravity,CopyAlphaCompositeOp )
340
350
  fg = no_bg.clone
341
351
 
342
352
  # Resize the image to a tiny size and then back to the original size
343
353
  # to achieve the desired color diffusion. The idea is inspired by
344
354
  # Anthony Thyssen's http://www.imagemagick.org/Usage/scripts/hole_fill_shepards
345
355
  # script, which is intended just for this purpose (i. e. removing undesired
346
- # areas from the image). However our approach is a bit cruder (but still
356
+ # areas from the image). However our approach is a bit more crude (but still
347
357
  # effective).
348
358
  fg.resize!( width=imw/100,height=imh/100,filter=GaussianFilter )
349
359
  fg.resize!( width=imw,height=imh,filter=GaussianFilter )
@@ -352,7 +362,10 @@ class PDFBeads::PageDataProvider < Array
352
362
  fg.alpha( DeactivateAlphaChannel )
353
363
 
354
364
  fgpath = "#{@basename}.fg." << fmt.downcase
355
- @fg_layer = fgpath if writeImage( fg,fgpath,fmt )
365
+ if writeImage( fg,fgpath,fmt )
366
+ @fg_layer = fgpath
367
+ @fg_created = true
368
+ end
356
369
 
357
370
  fg.destroy!
358
371
  no_bg.destroy!
@@ -390,8 +403,8 @@ class PDFBeads::PageDataProvider < Array
390
403
 
391
404
  # A hack for some Windows versions of RMagick, which throw an error the
392
405
  # first time when Magick.formats is accessed
406
+ retries = 2
393
407
  begin
394
- retries = 2
395
408
  mfmts = Magick.formats
396
409
  rescue
397
410
  retry if (retries -= 1 ) > 0
@@ -458,7 +471,7 @@ class PDFBeads::PageDataProvider < Array
458
471
  end
459
472
 
460
473
  if pidx == per_dict or i == length - 1
461
- # The jbig2 encoder processes a bunch of files at once, producing
474
+ # The jbig2 encoder processes a bunch of files at once, producing
462
475
  # pages which depend on a shared dictionary. Thus we can skip this
463
476
  # stage only if both the dictionary and each of the individual pages
464
477
  # are already found on the disk
@@ -8,7 +8,7 @@
8
8
  # Unlike other PDF creation tools, this utility attempts to implement
9
9
  # the approach typically used for DjVu books. Its key feature is
10
10
  # separating scanned text (typically black, but indexed images with
11
- # a small number of colors are also accepted) from halftone images
11
+ # a small number of colors are also accepted) from halftone images
12
12
  # placed into a background layer.
13
13
  #
14
14
  # Copyright (C) 2010 Alexey Kryukov (amkryukov@gmail.com).
@@ -38,7 +38,7 @@
38
38
  # <indent>"Title" "Page Number" [0|-|1|+]
39
39
  #
40
40
  # The indent is used to determine the level of this outline item: it may
41
- # consist either of spaces or of tabs, but it is not allowed to
41
+ # consist either of spaces or of tabs, but it is not allowed to
42
42
  # mix both characters in the same file. The title and page number are
43
43
  # separated with an arbitrary number of whitespace characters and are
44
44
  # normally enclosed into double quotes. The third, optional argument
@@ -101,7 +101,11 @@ class PDFBeads::PDFBuilder::PDFTOC < Array
101
101
  title = parts[0].gsub(/\A"/m,"").gsub(/"\Z/m, "")
102
102
  ref = parts[1].gsub(/\A"/m,"").gsub(/"\Z/m, "")
103
103
  begin
104
- title = Iconv.iconv( "utf-16be", "utf-8", title ).first
104
+ if title.respond_to? :encode
105
+ title.encode!( "utf-16be", "utf-8" )
106
+ else
107
+ title = Iconv.iconv( "utf-16be", "utf-8", title ).first
108
+ end
105
109
  rescue
106
110
  $stderr.puts("Error: TOC should be specified in utf-8")
107
111
  return
data/lib/pdfbeads.rb CHANGED
@@ -8,7 +8,7 @@
8
8
  # Unlike other PDF creation tools, this utility attempts to implement
9
9
  # the approach typically used for DjVu books. Its key feature is
10
10
  # separating scanned text (typically black, but indexed images with
11
- # a small number of colors are also accepted) from halftone images
11
+ # a small number of colors are also accepted) from halftone images
12
12
  # placed into a background layer.
13
13
  #
14
14
  # Copyright (C) 2010 Alexey Kryukov (amkryukov@gmail.com).
@@ -30,19 +30,25 @@
30
30
  #
31
31
  #######################################################################
32
32
 
33
- require 'iconv'
34
33
  require 'zlib'
35
34
 
36
- require 'RMagick'
35
+ require 'rmagick'
37
36
  include Magick
38
37
 
39
38
  begin
40
- require 'hpricot'
41
- $has_hpricot = true
39
+ require 'nokogiri'
40
+ $has_nokogiri = true
42
41
  rescue LoadError
43
- $stderr.puts( "Warning: the hpricot extension is not available. I'll not be able" )
42
+ $stderr.puts( "Warning: the nokogiri extension is not available. I'll not be able" )
44
43
  $stderr.puts( "\tto create hidden text layer from hOCR files." )
45
- $has_hpricot = false
44
+ $has_nokogiri = false
45
+ end
46
+
47
+ begin
48
+ require 'pdf/reader'
49
+ $has_pdfreader = true
50
+ rescue LoadError
51
+ $has_pdfreader = false
46
52
  end
47
53
 
48
54
  unless ''.respond_to? :ord
@@ -50,6 +56,11 @@ unless ''.respond_to? :ord
50
56
  require 'jcode'
51
57
  end
52
58
 
59
+ # Require iconv for Ruby version less than 1.9.3
60
+ unless ''.respond_to? :encode
61
+ require 'iconv'
62
+ end
63
+
53
64
  class String
54
65
  # Protect strings which are supposed to be treated as a raw sequence of bytes.
55
66
  # This is important for Ruby 1.9. For earlier versions the method just
metadata CHANGED
@@ -1,82 +1,113 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: pdfbeads
3
- version: !ruby/object:Gem::Version
4
- hash: 25
5
- prerelease:
6
- segments:
7
- - 1
8
- - 0
9
- - 7
10
- version: 1.0.7
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.1.3
11
5
  platform: ruby
12
- authors:
6
+ authors:
13
7
  - Alexey Kryukov
14
- autorequire:
8
+ autorequire:
15
9
  bindir: bin
16
10
  cert_chain: []
17
-
18
- date: 2012-02-10 00:00:00 +04:00
19
- default_executable: pdfbeads
20
- dependencies: []
21
-
22
- description: " PDFBeads is a small utility written in Ruby which takes scanned\n page images and converts them into a single PDF file. Unlike other\n PDF creation tools, PDFBeads attempts to implement the approach\n typically used for DjVu books. Its key feature is separating scanned\n text (typically black, but indexed images with a small number of\n colors are also accepted) from halftone pictures. Each type of\n graphical data is encoded into its own layer with a specific\n compression method and resolution.\n"
11
+ date: 2021-11-24 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rmagick
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: 3.2.0
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: 3.2.0
27
+ - !ruby/object:Gem::Dependency
28
+ name: nokogiri
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 1.5.10
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: 1.5.10
41
+ - !ruby/object:Gem::Dependency
42
+ name: pdf-reader
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: 1.0.0
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: 1.0.0
55
+ description: |2
56
+ PDFBeads is a small utility written in Ruby which takes scanned
57
+ page images and converts them into a single PDF file. Unlike other
58
+ PDF creation tools, PDFBeads attempts to implement the approach
59
+ typically used for DjVu books. Its key feature is separating scanned
60
+ text (typically black, but indexed images with a small number of
61
+ colors are also accepted) from halftone pictures. Each type of
62
+ graphical data is encoded into its own layer with a specific
63
+ compression method and resolution.
23
64
  email: amkryukov@gmail.com
24
- executables:
65
+ executables:
25
66
  - pdfbeads
26
67
  extensions: []
27
-
28
- extra_rdoc_files:
68
+ extra_rdoc_files:
29
69
  - README
30
70
  - COPYING
31
71
  - ChangeLog
32
- files:
72
+ files:
73
+ - COPYING
74
+ - ChangeLog
75
+ - README
76
+ - bin/pdfbeads
77
+ - doc/pdfbeads.en.html
78
+ - doc/pdfbeads.ru.html
79
+ - lib/imageinspector.rb
80
+ - lib/pdfbeads.rb
33
81
  - lib/pdfbeads/pdfbuilder.rb
34
- - lib/pdfbeads/pdfpage.rb
35
- - lib/pdfbeads/pdftoc.rb
82
+ - lib/pdfbeads/pdfdoc.rb
36
83
  - lib/pdfbeads/pdffont.rb
37
84
  - lib/pdfbeads/pdflabels.rb
38
- - lib/pdfbeads/pdfdoc.rb
39
- - lib/pdfbeads.rb
40
- - lib/imageinspector.rb
41
- - bin/pdfbeads
42
- - doc/pdfbeads.ru.html
43
- - README
44
- - COPYING
45
- - ChangeLog
46
- has_rdoc: true
85
+ - lib/pdfbeads/pdfpage.rb
86
+ - lib/pdfbeads/pdftoc.rb
47
87
  homepage: http://pdfbeads.rubyforge.org
48
- licenses: []
49
-
50
- post_install_message:
88
+ licenses:
89
+ - GPL-3.0+
90
+ metadata: {}
91
+ post_install_message:
51
92
  rdoc_options: []
52
-
53
- require_paths:
93
+ require_paths:
54
94
  - lib
55
- required_ruby_version: !ruby/object:Gem::Requirement
56
- none: false
57
- requirements:
95
+ required_ruby_version: !ruby/object:Gem::Requirement
96
+ requirements:
58
97
  - - ">="
59
- - !ruby/object:Gem::Version
60
- hash: 3
61
- segments:
62
- - 0
63
- version: "0"
64
- required_rubygems_version: !ruby/object:Gem::Requirement
65
- none: false
66
- requirements:
98
+ - !ruby/object:Gem::Version
99
+ version: '0'
100
+ required_rubygems_version: !ruby/object:Gem::Requirement
101
+ requirements:
67
102
  - - ">="
68
- - !ruby/object:Gem::Version
69
- hash: 3
70
- segments:
71
- - 0
72
- version: "0"
73
- requirements:
74
- - RMagick, v2.13.0 or greater
75
- - Hpricot, v0.8.3 or greater
76
- rubyforge_project: PDFBeads
77
- rubygems_version: 1.5.0
78
- signing_key:
79
- specification_version: 3
103
+ - !ruby/object:Gem::Version
104
+ version: '0'
105
+ requirements:
106
+ - RMagick, v3.2.0 or greater
107
+ - nokogiri, v1.5.10 or greater
108
+ - PDF::Reader, v1.0.0 or greater
109
+ rubygems_version: 3.2.29
110
+ signing_key:
111
+ specification_version: 4
80
112
  summary: PDFBeads -- convert scanned images to a single PDF file.
81
113
  test_files: []
82
-