pdfbeads 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/COPYING +339 -0
- data/ChangeLog +3 -0
- data/README +53 -0
- data/bin/pdfbeads +189 -0
- data/doc/pdfbeads.ru.html +509 -0
- data/lib/imageinspector.rb +503 -0
- data/lib/pdfbeads.rb +93 -0
- data/lib/pdfbeads/pdfbuilder.rb +699 -0
- data/lib/pdfbeads/pdfdoc.rb +149 -0
- data/lib/pdfbeads/pdffont.rb +533 -0
- data/lib/pdfbeads/pdflabels.rb +139 -0
- data/lib/pdfbeads/pdfpage.rb +466 -0
- data/lib/pdfbeads/pdftoc.rb +160 -0
- metadata +82 -0
data/lib/pdfbeads.rb
ADDED
@@ -0,0 +1,93 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
######################################################################
|
4
|
+
#
|
5
|
+
# PDFBeads -- convert scanned images to a single PDF file
|
6
|
+
# Version 1.0
|
7
|
+
#
|
8
|
+
# Unlike other PDF creation tools, this utility attempts to implement
|
9
|
+
# the approach typically used for DjVu books. Its key feature is
|
10
|
+
# separating scanned text (typically black, but indexed images with
|
11
|
+
# a small number of colors are also accepted) from halftone images
|
12
|
+
# placed into a background layer.
|
13
|
+
#
|
14
|
+
# Copyright (C) 2010 Alexey Kryukov (amkryukov@gmail.com).
|
15
|
+
# All rights reserved.
|
16
|
+
#
|
17
|
+
# This program is free software; you can redistribute it and/or modify
|
18
|
+
# it under the terms of the GNU General Public License as published by
|
19
|
+
# the Free Software Foundation; either version 2 of the License, or
|
20
|
+
# (at your option) any later version.
|
21
|
+
#
|
22
|
+
# This program is distributed in the hope that it will be useful,
|
23
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
24
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
25
|
+
# GNU General Public License for more details.
|
26
|
+
#
|
27
|
+
# You should have received a copy of the GNU General Public License
|
28
|
+
# along with this program; if not, write to the Free Software
|
29
|
+
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
30
|
+
#
|
31
|
+
#######################################################################
|
32
|
+
|
33
|
+
require 'iconv'
|
34
|
+
require 'zlib'
|
35
|
+
|
36
|
+
require 'RMagick'
|
37
|
+
include Magick
|
38
|
+
|
39
|
+
begin
|
40
|
+
require 'hpricot'
|
41
|
+
$has_hpricot = true
|
42
|
+
rescue LoadError
|
43
|
+
$stderr.puts( "Warning: the hpricot extension is not available." )
|
44
|
+
$stderr.puts( " pdfbeads will not be able to read OCR data from hOCR files." )
|
45
|
+
$has_hpricot = false
|
46
|
+
end
|
47
|
+
|
48
|
+
unless ''.respond_to? :ord
|
49
|
+
$KCODE = 'u'
|
50
|
+
require 'jcode'
|
51
|
+
end
|
52
|
+
|
53
|
+
class String
  # Tag the receiver as a raw byte sequence (ASCII-8BIT) and return it.
  # Strings that are supposed to be treated as binary data need this under
  # Ruby 1.9; where force_encoding is unavailable the receiver is returned
  # untouched.
  unless method_defined?( :to_binary )
    def to_binary
      force_encoding( 'ASCII-8BIT' ) if respond_to?( :force_encoding )
      self
    end
  end

  # Tag the receiver as UTF-8 and return it. Under Ruby 1.9 a string
  # sometimes has to be marked as UTF-8 even though we know it is not.
  unless method_defined?( :to_text )
    def to_text
      force_encoding( 'UTF-8' ) if respond_to?( :force_encoding )
      self
    end
  end

  # Unicode ordinal of an encoded character. Only defined when String has
  # no ord of its own (Ruby < 1.9). Any conversion failure yields 0x3F,
  # the question mark.
  unless method_defined?( :ord )
    def ord
      Iconv.iconv( 'utf-16be','utf-8',self ).first.unpack( 'n' )[0]
    rescue
      0x3F
    end
  end
end
|
85
|
+
|
86
|
+
require 'imageinspector'
|
87
|
+
|
88
|
+
# Namespace of the PDFBeads library. Evaluating this module body loads the
# pdfbuilder and pdfpage components.
module PDFBeads
|
89
|
+
# Library version string.
VERSION = '1.0'
|
90
|
+
require 'pdfbeads/pdfbuilder'
|
91
|
+
require 'pdfbeads/pdfpage'
|
92
|
+
end
|
93
|
+
|
@@ -0,0 +1,699 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: UTF-8
|
3
|
+
|
4
|
+
######################################################################
|
5
|
+
#
|
6
|
+
# PDFBeads -- convert scanned images to a single PDF file
|
7
|
+
# Version 1.0
|
8
|
+
#
|
9
|
+
# Unlike other PDF creation tools, this utility attempts to implement
|
10
|
+
# the approach typically used for DjVu books. Its key feature is
|
11
|
+
# separating scanned text (typically black, but indexed images with
|
12
|
+
# a small number of colors are also accepted) from halftone images
|
13
|
+
# placed into a background layer.
|
14
|
+
#
|
15
|
+
# Copyright (C) 2010 Alexey Kryukov (amkryukov@gmail.com).
|
16
|
+
# All rights reserved.
|
17
|
+
#
|
18
|
+
# This program is free software; you can redistribute it and/or modify
|
19
|
+
# it under the terms of the GNU General Public License as published by
|
20
|
+
# the Free Software Foundation; either version 2 of the License, or
|
21
|
+
# (at your option) any later version.
|
22
|
+
#
|
23
|
+
# This program is distributed in the hope that it will be useful,
|
24
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
25
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
26
|
+
# GNU General Public License for more details.
|
27
|
+
#
|
28
|
+
# You should have received a copy of the GNU General Public License
|
29
|
+
# along with this program; if not, write to the Free Software
|
30
|
+
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
31
|
+
#
|
32
|
+
#######################################################################
|
33
|
+
|
34
|
+
require 'time'
|
35
|
+
require 'stringio'
|
36
|
+
|
37
|
+
# The key class where the actual generation of a PDF file is performed.
|
38
|
+
class PDFBeads::PDFBuilder
|
39
|
+
require 'pdfbeads/pdfdoc'
|
40
|
+
require 'pdfbeads/pdffont'
|
41
|
+
require 'pdfbeads/pdflabels'
|
42
|
+
require 'pdfbeads/pdftoc'
|
43
|
+
|
44
|
+
# Lookup table from an image type name to a PDF color space name. The keys
# look like RMagick ImageType names (TODO confirm against the caller). Any
# key that is not listed resolves to '/DeviceRGB'.
@@cmodes = Hash.new( '/DeviceRGB' ).update(
  'BilevelType'              => '/DeviceGray',
  'GrayscaleType'            => '/DeviceGray',
  'PaletteType'              => '/Indexed',
  'PaletteMatteType'         => '/Indexed',
  'TrueColorType'            => '/DeviceRGB',
  'TrueColorMatteType'       => '/DeviceRGB',
  'ColorSeparationType'      => '/DeviceCMYK',
  'ColorSeparationMatteType' => '/DeviceCMYK',
  'PaletteBilevelMatteType'  => '/DeviceGray'
)
|
56
|
+
|
57
|
+
# Set up an empty PDF document.
#
# pdfargs -- option hash; this class reads the :labels, :toc, :meta and
#            :pagelayout entries from it later on.
def initialize( pdfargs )
  @pdfargs = pdfargs
  @now     = Time.now
  @doc     = Doc.new
  @fdata   = FontDataProvider.new

  # Path and PDF object of the most recently embedded JBIG2 symbol
  # dictionary (see loadJBIG2Page).
  @dictpath, @dictobj = '', nil
end
|
66
|
+
|
67
|
+
# Build the whole PDF object tree (catalog, info dictionary, optional
# content groups, page labels, pages, fonts and outlines) and add every
# object to @doc.
#
# pagefiles -- collection responding to each; every entry must respond to
#              name, width, height, x_res, y_res, stencils, fg_layer,
#              bg_layer and hocr_path.
# st_format -- 'JBIG2' loads stencils with loadJBIG2Page, any other value
#              uses loadCCITTPage.
def process( pagefiles,st_format )
  labels = toc = nil
  labels = PDFLabels.new( @pdfargs[:labels] ) unless @pdfargs[:labels].nil?
  toc = PDFTOC.new( @pdfargs[:toc] ) unless @pdfargs[:toc].nil?
  meta = parseMeta( @pdfargs[:meta] )
  has_labels = ( !labels.nil? && labels.length > 0 )

  cat = XObj.new(Hash[
    'Type'       => '/Catalog',
    'PageLayout' => "/#{@pdfargs[:pagelayout]}"
  ])
  @doc.addObject(cat)

  # PDF date string: D:YYYYMMDDHHmmSS followed by either 'Z' (UTC) or a
  # sign and the absolute offset as HH'mm. The sign has to be emitted
  # separately from the digits: formatting a negative offset directly
  # yields a doubled sign and, because integer division floors, a wrong
  # hour for offsets that are not whole hours.
  utc_offset = @now.gmt_offset
  creationDate = sprintf( "D:%04d%02d%02d%02d%02d%02d",
    @now.year, @now.month, @now.day, @now.hour, @now.min, @now.sec )
  if utc_offset == 0
    creationDate << 'Z'
  else
    gmt_mins = utc_offset.abs / 60
    creationDate << sprintf( "%s%02d'%02d",
      utc_offset > 0 ? '+' : '-', gmt_mins / 60, gmt_mins % 60 )
  end

  info = XObj.new(Hash[
    'Creator'      => "(PDFBeads)",
    'Producer'     => "(PDFBeads)",
    'CreationDate' => "(#{creationDate})"
  ])
  @doc.addObject(info)
  # Metadata values arrive from parseMeta already converted to UTF-16BE;
  # \xFE\xFF is the byte order mark in front of them.
  meta.each_key do |key|
    info.addToDict(key, "(\xFE\xFF#{meta[key].to_text})")
  end

  out = XObj.new(Hash[
    'Type'  => '/Outlines',
    'Count' => 0
  ])
  @doc.addObject(out)
  cat.addToDict('Outlines', ref(out.getID))

  pages = XObj.new(Hash[
    'Type' => '/Pages'
  ])
  @doc.addObject(pages)
  cat.addToDict('Pages', ref(pages.getID))

  creator = XObj.new(Hash[
    'Subtype' => '/Artwork',
    'Creator' => "(PDFBeads)",
    'Feature' => '(Layers)'
  ])
  @doc.addObject(creator)

  # Two optional content groups: stencils and the foreground image go
  # into "Foreground", the background image into "Background".
  ocFore = XObj.new(Hash[
    'Type'   => '/OCG',
    'Name'   => '(Foreground)',
    'Usage'  => "<</CreatorInfo #{ref(creator.getID)}>>",
    'Intent' => '[/View/Design]'
  ])
  @doc.addObject(ocFore)
  ocBack = XObj.new({
    'Type'   => '/OCG',
    'Name'   => '(Background)',
    'Usage'  => "<</CreatorInfo #{ref(creator.getID)}>>",
    'Intent' => '[/View/Design]'
  })
  @doc.addObject(ocBack)
  cat.addToDict('OCProperties',
    sprintf("<< /OCGs[%s %s] /D<< /Intent /View /BaseState (ON) /Order[%s %s] >>>>",
      ref(ocFore.getID), ref(ocBack.getID), ref(ocFore.getID), ref(ocBack.getID)))

  page_objs = Array.new()
  pages_by_num = Hash.new()
  pidx = 0
  cur_range_id = 0

  if has_labels
    nTree = "<</Nums[\n"
    labels.each do |rng|
      nTree << "#{rng[:first]} << "
      # Convert the prefix only when the range actually has one; a range
      # without a prefix must not be passed through the converter.
      if rng.has_key? :prefix
        ltitl = Iconv.iconv( "utf-16be", "utf-8", rng[:prefix] ).first.to_text
        nTree << "/P (\xFE\xFF#{ltitl.to_text}) "
      end
      nTree << "/S /#{rng[:style]} " if rng.has_key? :style
      nTree << "/St #{rng[:start]}" if rng.has_key? :start
      nTree << ">>\n"
    end

    nTree << "]\n>>"
    cat.addToDict('PageLabels', nTree)
  end

  # A font is needed as soon as a single page carries hOCR data.
  needs_font = false
  encodings = nil
  fdict = descr = nil
  pagefiles.each do |page_file|
    unless page_file.hocr_path.nil?
      needs_font = true
      break
    end
  end

  if needs_font
    # Each entry of encodings is the character list of one 8-bit font;
    # getPDFText extends the list while it encodes the text.
    encodings = [ [' '] ]
    fdict = XObj.new( Hash[] )
    @doc.addObject( fdict )

    descr = XObj.new( Hash[
      'Type'     => '/FontDescriptor',
      'BaseFont' => '/Times-Roman',
    ] )
    @fdata.header.each_key do |key|
      descr.addToDict( key,@fdata.header[key] )
    end
    @doc.addObject( descr )
  end

  pagefiles.each do |page_file|
    procSet = ['/PDF', '/ImageB']
    c_str = ''
    doc_objs = Array.new()
    lastimg = 0

    width = page_file.width; height = page_file.height
    xres = page_file.x_res; yres = page_file.y_res
    pwidth = width.to_f / xres * 72
    pheight = height.to_f / yres * 72

    page_file.stencils.each do |s|
      if st_format.eql? 'JBIG2'
        loaded = loadJBIG2Page( s[:jbig2path],s[:jbig2dict],ref(ocFore.getID) )
      else
        loaded = loadCCITTPage( s[:path],ref(ocFore.getID) )
      end
      # Stop at the first stencil that cannot be loaded. Checking before
      # the destructuring keeps the current xres/yres, which the hOCR
      # scaling below still needs.
      break if loaded.nil?
      xobj,width,height,xres,yres = loaded

      color = s[:rgb].join(' ') << ' rg'
      doc_objs << xobj

      c_str << "#{color} /Im#{lastimg} Do "
      lastimg += 1
    end

    fg_image = bg_image = nil
    fg_image = loadImage( page_file.fg_layer,ocFore.getID,procSet ) unless page_file.fg_layer.nil?
    bg_image = loadImage( page_file.bg_layer,ocBack.getID,procSet ) unless page_file.bg_layer.nil?

    contents = XObj.new(Hash[
      'Filter' => '/FlateDecode'
    ])
    resobj = XObj.new(Hash.new())
    resources = XObj.new(Hash[
      'XObject' => ref(resobj.getID)
    ])

    if fg_image.nil?
      doc_objs.each_index do |i|
        resobj.addToDict( "Im#{i}", ref(doc_objs[i].getID) )
      end
    else
      # The first stencil becomes the soft mask of the foreground image.
      # NOTE(review): a foreground layer without any loaded stencil leaves
      # doc_objs empty and fails here -- confirm whether callers can
      # produce that combination.
      xobj = doc_objs[0]
      fg_image.addToDict('SMask', ref(xobj.getID))
      xobj.removeFromDict('ImageMask')
      xobj.addToDict('Decode', '[1 0]')
      resobj.addToDict('Im0', ref(fg_image.getID))
      doc_objs << fg_image
      c_str = '/Im0 Do '
    end

    unless bg_image.nil?
      # The background is painted first, so its operator is prepended.
      c_str = "/Im#{resobj.dictLength} Do " << c_str
      resobj.addToDict( "Im#{resobj.dictLength}", ref(bg_image.getID) )
      doc_objs << bg_image
    end
    c_str = sprintf( "q %.2f 0 0 %.2f 0 0 cm %sQ",pwidth,pheight,c_str )

    doc_objs.concat( [contents, resobj, resources] )

    hocr = nil
    unless page_file.hocr_path.nil?
      # File.open instead of Kernel#open: the latter would run a command
      # for a path that starts with '|'.
      hocr = File.open( page_file.hocr_path ) { |f| Hpricot.parse( f ) }
      procSet << '/Text'
      c_str << getPDFText( hocr,pheight,72.0/xres,72.0/yres,encodings )
    end

    contents.reinit( Hash[
      'Filter' => '/FlateDecode'
    ], Zlib::Deflate.deflate( c_str,9 ) )
    resources.addToDict( 'ProcSet', "[ #{procSet.join(' ')} ]" )
    resources.addToDict( 'Font', ref( fdict.getID ) ) unless hocr.nil?

    page = XObj.new(Hash[
      'Type'      => '/Page',
      'Parent'    => "#{pages.getID} 0 R",
      'MediaBox'  => sprintf( "[ 0 0 %.02f %.02f ]",pwidth,pheight ),
      'Contents'  => ref( contents.getID ),
      'Resources' => ref( resources.getID )
    ])
    # By default acroread uses /DeviceCMYK as a transparency blending space,
    # so adding an SMask image to a page would result in colors being
    # shifted, unless we take special care of this. For more details see
    # http://comments.gmane.org/gmane.comp.tex.pdftex/3747
    unless fg_image.nil?
      cspace = '/DeviceRGB'
      cspace = fg_image.getFromDict( 'ColorSpace' ) if fg_image.hasInDict( 'ColorSpace' )
      page.addToDict( 'Group', "<< /S /Transparency /CS #{cspace} >>" )
    end
    doc_objs << page
    doc_objs.each{ |x| @doc.addObject(x) }
    page_objs << page

    # Remember the page object under its label (or its 1-based index) so
    # that outline entries can be resolved to page objects later.
    pkey = pidx + 1
    pkey = labels.getPageLabel( cur_range_id,pidx ) if has_labels
    pages_by_num[pkey] = page.getID
    pidx += 1
    if has_labels
      if cur_range_id < labels.length - 1 and labels[cur_range_id + 1][:first] == pidx
        cur_range_id += 1
      end
    end

    $stderr.puts("Processed #{page_file.name}\n")
    $stderr.puts("  Added background image from #{page_file.bg_layer}\n") unless bg_image.nil?
    $stderr.puts("  Added foreground image from #{page_file.fg_layer}\n") unless fg_image.nil?
  end

  # Written once after the loop instead of being rebuilt for every page.
  pages.addToDict( 'Count', page_objs.length )
  pages.addToDict( 'Kids', '[' << page_objs.map{|x| ref(x.getID).to_s}.join(' ') << ']' )

  if needs_font
    fidx = 1
    encodings.each do |enc|
      font = addFont( descr,enc,"Fnt#{fidx}" )
      fdict.addToDict( "Fnt#{fidx}",ref(font.getID) )
      fidx += 1
    end
  end

  if toc != nil and toc.length > 0
    getOutlineObjs( toc,pages_by_num,page_objs[0].getID )
    cat.addToDict('Outlines', ref(toc[0][:pdfobj].getID))
    cat.addToDict('PageMode', "/UseOutlines")
  end
end
|
315
|
+
|
316
|
+
# Write the generated PDF to outpath, or to standard output when outpath
# is the literal string 'STDOUT'.
#
# The data is always written in binary mode. A file target is opened with
# the block form of File.open, so the handle is closed even when the write
# fails. Any failure is reported on stderr and not re-raised. Always
# returns nil.
def output( outpath )
  begin
    if outpath.eql? 'STDOUT'
      $stdout.binmode
      $stdout.write( @doc.to_s )
    else
      File.open( outpath,'wb' ) { |out| out.write( @doc.to_s ) }
    end
  rescue
    $stderr.puts( "Error: could not write to #{outpath}" )
  end
  nil
end
|
332
|
+
|
333
|
+
private
|
334
|
+
|
335
|
+
# Read document metadata from the file at path.
#
# Lines starting with '#' are skipped. A line of the form
#   Key: "value"      (an optional leading '/' before Key is accepted)
# is used when Key is one of Title, Author, Subject or Keywords; its value
# is converted from UTF-8 to UTF-16BE. A value that cannot be converted is
# reported on stderr and dropped.
#
# Returns a Hash mapping key to converted value; it is empty when path is
# nil or ''.
def parseMeta( path )
  meta = {}
  return meta if path.nil? or path.eql? ''

  accepted = %w[Title Author Subject Keywords]
  File.open( path,'r' ) do |fin|
    fin.set_encoding 'UTF-8' if fin.respond_to? :set_encoding
    fin.each do |line|
      next if /^\#/.match( line )

      found = /^\/?([A-Za-z]+)[ \t]*:[ \t]+\"(.*)\"/.match( line )
      next if found.nil?
      next unless accepted.include? found[1]

      begin
        meta[found[1]] = Iconv.iconv( "utf-16be", "utf-8", found[2] ).first
      rescue
        $stderr.puts("Error: metadata should be specified in utf-8")
      end
    end
  end
  meta
end
|
359
|
+
|
360
|
+
# Create the PDF outline (bookmark) objects for a table of contents and
# add them to @doc.
#
# toc      -- sequence whose first entry is the outline root and whose
#             remaining entries are the outline items. Every entry is
#             accessed like a hash (:title, :ref, :parent, :children,
#             :open, optional :prev / :next) and responds to
#             getChildrenCount. The created object is stored under :pdfobj.
# page_ids -- Hash mapping a page reference to a page object ID.
# fp_id    -- ID of the first page; kept for interface compatibility, the
#             method does not use it.
def getOutlineObjs( toc,page_ids,fp_id )
  root = toc[0]
  root[:pdfobj] = XObj.new( Hash[
    'Type'  => '/Outlines',
    'Count' => root.getChildrenCount
  ])
  @doc.addObject(root[:pdfobj])

  toc[1..-1].each do |item|
    dest = nil
    if page_ids.has_key? item[:ref]
      dest = page_ids[item[:ref]]
    else
      $stderr.puts("Malformed TOC: there is no page #{item[:ref]} in this document.")
    end

    # The title is assumed to be UTF-16BE (it is written after a \xFE\xFF
    # byte order mark). Escape every '(', ')' and '\' by working on whole
    # 16-bit code units, so that a match can never straddle two characters
    # and all occurrences are covered, not just the first.
    item_text = item[:title].unpack( 'n*' ).map{ |unit|
      [ 0x28,0x29,0x5C ].include?( unit ) ? [ 0x5C,unit ] : unit
    }.flatten.pack( 'n*' )

    item[:pdfobj] = XObj.new(Hash[
      'Title'  => "(\xFE\xFF#{item_text.to_text})",
      'Parent' => ref(item[:parent][:pdfobj].getID),
    ])
    if dest != nil
      item[:pdfobj].addToDict('Dest', "[ #{dest} 0 R /XYZ null null null ]")
    else
      # No target page: the entry gets a gray color instead of a destination.
      item[:pdfobj].addToDict('C', "[0.75 0.75 0.75]")
    end

    if item[:children].length > 0
      cnt = item.getChildrenCount
      # A negative count marks the entry as closed.
      if item[:open]
        item[:pdfobj].addToDict('Count', cnt)
      else
        item[:pdfobj].addToDict('Count', -cnt)
      end
    end

    # Link the entry into its parent's First/Last and its siblings'
    # Prev/Next chain.
    unless item.has_key? :prev
      item[:parent][:pdfobj].addToDict('First', ref(item[:pdfobj].getID))
    else
      item[:prev][:pdfobj].addToDict('Next', ref(item[:pdfobj].getID))
      item[:pdfobj].addToDict('Prev', ref(item[:prev][:pdfobj].getID))
    end

    unless item.has_key? :next
      item[:parent][:pdfobj].addToDict('Last', ref(item[:pdfobj].getID))
    end

    @doc.addObject(item[:pdfobj])
  end
end
|
414
|
+
|
415
|
+
# Returns the bounding box of an hOCR element as [ x0,y0,x1,y1 ] floats.
#
# The box is read from a "bbox x0 y0 x1 y1" entry in the element's title
# attribute. Horizontal coordinates (x0, x1) are multiplied by xscale and
# vertical ones (y0, y1) by yscale. When the element has no title or the
# title carries no bbox, [0,0,0,0] is returned.
def elementCoordinates( element,xscale,yscale )
  out = [0,0,0,0]

  attrs = element.attributes.to_hash
  if attrs.has_key? 'title'
    found = /bbox((\s+\d+){4})/.match( attrs['title'] )
    unless found.nil?
      x0,y0,x1,y1 = found[1].strip.split( /\s+/ ).map{ |c| c.to_i }
      out = [ (x0*xscale).to_f,(y0*yscale).to_f,
              (x1*xscale).to_f,(y1*yscale).to_f ]
    end
  end
  return out
end
|
429
|
+
|
430
|
+
# Build the content-stream fragment that places the hOCR text of one page
# in text render mode 3.
#
# hocr      -- parsed hOCR document; searched for the Content-Type meta
#              element and for spans of class ocr_line.
# pheight   -- page height, used to flip the vertical coordinate.
# xscale,
# yscale    -- factors passed to elementCoordinates for the line boxes.
# encodings -- array of character lists, one per 8-bit font (/Fnt1,
#              /Fnt2, ...). It is extended in place: an unknown character
#              is appended to the last list, and a new list is started once
#              that one holds 256 entries.
#
# Returns the fragment as a String ("BT ... ET").
def getPDFText( hocr,pheight,xscale,yscale,encodings )
  fsize = 10
  cur_enc = encodings[0]
  ret = " BT 3 Tr /Fnt1 #{fsize} Tf "

  # Charset declared by the document; whitespace after the ';' is accepted
  # ("text/html; charset=utf-8" is the usual spelling).
  charset = 'utf-8'
  hocr.search("//meta[@http-equiv='Content-Type']").each do |el|
    attrs = el.attributes.to_hash
    next unless attrs.has_key? 'content'
    found = /\Atext\/html;\s*charset=([A-Za-z0-9-]+)\s*\Z/i.match( attrs['content'] )
    charset = found[1] unless found.nil?
  end

  hocr.search("//span[@class='ocr_line']").each do |line|
    # Collapse every run of line breaks inside the line, not only the first.
    txt = line.to_plain_text.strip.gsub( /[\n\r]+/,' ' )
    begin
      txt = Iconv.iconv( 'utf-8',charset,txt ).first unless charset.downcase.eql? 'utf-8'
    rescue
      txt = ''
    end
    next if txt.eql? ''
    txt.force_encoding( 'utf-8' ) if txt.respond_to? :force_encoding
    # A hyphen at the end of the line becomes a soft hyphen (U+00AD).
    txt.sub!( /-\Z/, "\xC2\xAD" )

    # Scale the text uniformly so that its width matches the line box.
    bbox = elementCoordinates( line,xscale,yscale )
    ratio = ( bbox[2] - bbox[0] ) / @fdata.getLineWidth( txt,fsize )
    ret << sprintf( "%f %f %f %f %f %f Tm ",
      ratio, 0, 0, ratio, bbox[0], pheight - bbox[3] - @fdata.header['Descent'] * fsize / 1000.0)

    txt8 = ''
    txt.each_char do |char|
      # The conversion result is discarded; the call only checks that the
      # character is valid UTF-8.
      begin
        Iconv.iconv( "utf-16be","utf-8",char )
      rescue
        rawbytes = char.unpack( 'C*' )
        bs = ''
        rawbytes.each{ |b| bs << sprintf( "%02x",b ) }
        $stderr.puts( "Warning: an invalid UTF-8 sequence (#{bs}) in the hOCR data." )
        char = '?' * rawbytes.length
      end

      encoded = false
      unless cur_enc.include? char
        # Look for another font that already contains the character.
        encodings.each_index do |i|
          enc = encodings[i]
          next if enc == cur_enc

          if enc.include? char
            ret << "<#{txt8}> Tj "
            cur_enc = enc
            ret << "/Fnt#{i + 1} #{fsize} Tf "
            txt8 = ''
            encoded = true
            break
          end
        end

        # No font has it yet: append it to the last font, or open a new
        # font when the last one is full.
        unless encoded
          last = encodings[-1]
          if last.length < 256
            last << char
          else
            last = [ ' ',char ]
            encodings << last
          end

          if cur_enc != last
            ret << "<#{txt8}> Tj "
            cur_enc = last
            ret << "/Fnt#{encodings.length} #{fsize} Tf "
            txt8 = ''
          end
        end
      end

      # Characters are written as hex codes: the index within the font.
      txt8 << sprintf( "%02X",cur_enc.index(char) )
    end

    ret << "<#{txt8}> Tj " unless txt8.eql? ''
  end

  ret << "ET "
  return ret
end
|
513
|
+
|
514
|
+
# Create one 8-bit Type1 font for the character list fenc and add it to
# @doc together with its encoding and ToUnicode objects.
#
# descr -- font descriptor object shared by all fonts.
# fenc  -- list of characters; the position of a character is its code.
# fname -- resource name of the font (written as /fname).
#
# Returns the font object.
def addFont( descr,fenc,fname )
  enc_str = @fdata.getEncoding( fenc ).join( ' ' )
  enc = XObj.new( Hash[
    'Type'        => "/Encoding",
    'Differences' => "[ 0 #{enc_str} ]"
  ])
  @doc.addObject( enc )

  toUni = @fdata.getCMAP( fenc )
  @doc.addObject( toUni )

  font = XObj.new( Hash[
    'BaseFont'       => '/Times-Roman',
    'Name'           => "/#{fname}",
    'Subtype'        => '/Type1',
    'Type'           => '/Font',
    'FirstChar'      => 0,
    'LastChar'       => fenc.length - 1,
    'Widths'         => '[ ' << @fdata.getWidths(fenc).map{|w| w.to_s}.join(' ') << ' ]',
    'FontDescriptor' => ref(descr.getID),
    'ToUnicode'      => ref(toUni.getID),
  ] )
  # enc was created just above and can never be nil, so the font always
  # refers to its own encoding object.
  font.addToDict( 'Encoding',ref(enc.getID) )
  @doc.addObject( font )
  return font
end
|
544
|
+
|
545
|
+
# Load a bitonal stencil image and wrap it into a CCITT Group 4 image
# mask object.
#
# path  -- path of the stencil image.
# ocref -- reference to the optional content group the image belongs to.
#
# An image that is already CCITT-compressed and stored in a single strip
# is embedded as it is. Anything else is first converted to such a TIFF
# through ImageMagick; this includes images without a RowsPerStrip tag
# (0x116).
#
# Returns [ xobj,width,height,xres,yres ], or nil when the image cannot be
# inspected (its width is unknown).
def loadCCITTPage( path,ocref )
  stencil = ImageInspector.new( path )
  return nil if stencil.width.nil?

  width = stencil.width
  height = stencil.height
  xres = stencil.x_dpi
  yres = stencil.y_dpi

  # NOTE(review): tags is presumably empty or nil for non-TIFF input --
  # confirm against ImageInspector. Either way a missing tag must lead to
  # the conversion below, not to a failure.
  strip_tag = stencil.tags.nil? ? nil : stencil.tags[0x116]
  rows_per_strip = strip_tag.nil? ? nil : strip_tag[0]
  single_strip = ( stencil.compression.eql?( :CCITTFaxDecode ) &&
                   !rows_per_strip.nil? && rows_per_strip >= height )

  unless single_strip
    img = ImageList.new( path )
    imgdata = img.to_blob{
      self.format = 'TIFF'
      self.define( 'TIFF','rows-per-strip',height )
      self.compression = Group4Compression
    }
    stencil = ImageInspector.new( StringIO.new(imgdata) )
    img.destroy!
  end
  body = stencil.getRawData

  xobj = XObj.new(Hash[
    'Type'             => '/XObject',
    'Subtype'          => '/Image',
    'OC'               => ocref,
    'Width'            => width.to_s,
    'Height'           => height.to_s,
    'ImageMask'        => 'true',
    'ColorSpace'       => '/DeviceGray',
    'BitsPerComponent' => '1',
    'Filter'           => '/CCITTFaxDecode',
    'DecodeParms'      => "<< /Columns #{width} /K -1 >>"
  ], body)

  return [ xobj,width,height,xres,yres ]
end
|
582
|
+
|
583
|
+
# Load a JBIG2-encoded stencil and wrap it into an image mask object.
#
# path     -- path of the JBIG2 page data.
# dictpath -- path of the symbol dictionary the page refers to.
# ocref    -- reference to the optional content group the image belongs to.
#
# Width, height and both resolutions are read as four big-endian 32-bit
# integers from bytes 11..26 of the page data. The symbol dictionary is
# embedded only when dictpath differs from the one embedded last
# (@dictpath); otherwise the existing object (@dictobj) is referenced
# again.
#
# Both files are read in binary mode through the block form of File.open,
# so the data is not altered by text-mode translation and the handles are
# closed straight away.
#
# Returns [ xobj,width,height,xres,yres ], or nil (after a message on
# stderr) when a file cannot be read.
def loadJBIG2Page( path,dictpath,ocref )
  begin
    jbig2 = File.open( path,'rb' ) { |f| f.read }
    width, height, xres, yres = jbig2[11...27].unpack( 'NNNN' )
    unless @dictpath.eql? dictpath
      symd_f = File.open( dictpath,'rb' ) { |f| f.read }
      symd_o = @doc.addObject( XObj.new(Hash.new(),symd_f) )
      @dictpath = dictpath
      @dictobj = symd_o
    end
  rescue
    $stderr.puts( "Page not completed: could not access #{path}" )
    return nil
  end

  xobj = XObj.new(Hash[
    'Type'             => '/XObject',
    'Subtype'          => '/Image',
    'OC'               => ocref,
    'Width'            => width.to_s,
    'Height'           => height.to_s,
    'ImageMask'        => 'true',
    'ColorSpace'       => '/DeviceGray',
    'BitsPerComponent' => '1',
    'Filter'           => '/JBIG2Decode',
    'DecodeParms'      => "<< /JBIG2Globals #{@dictobj.getID} 0 R >>"
  ], jbig2)

  return [ xobj,width,height,xres,yres ]
end
|
613
|
+
|
614
|
+
# Load a foreground or background image and wrap it into an image object.
#
# impath  -- path of the image file.
# ocID    -- ID of the optional content group the image belongs to.
# procSet -- array of procedure set names of the page; '/ImageI' or
#            '/ImageC' is appended to it when the image requires it.
#
# Returns the image object, or nil when the image cannot be inspected
# (its width is unknown).
def loadImage( impath,ocID,procSet )
  insp = ImageInspector.new( impath )
  return nil if insp.width.nil?

  # JPEG, JPEG2000 and PNG data can be embedded as it is, and so can an
  # uncompressed TIFF. A compressed TIFF normally consists of several
  # separately compressed chunks which cannot simply be concatenated, so
  # it is only taken directly when it is stored in a single strip. Every
  # other image is converted by ImageMagick into a zip-compressed PNG,
  # and the raw data is then taken from that PNG.
  embeddable = [ :JPEG, :JPEG2000, :PNG ].include?( insp.format )
  if !embeddable && insp.format.eql?( :TIFF )
    embeddable = insp.compression.eql?( :NoCompression ) ||
      ( [ :FlateDecode,:LZWDecode,:CCITTFaxDecode ].include?( insp.compression ) &&
        insp.tags[0x0116][0] >= insp.height )
  end

  unless embeddable
    img = ImageList.new( impath )
    imgdata = img.to_blob{
      self.format = 'PNG'
      self.quality = 95
      self.compression = ZipCompression
    }
    insp = ImageInspector.new( StringIO.new(imgdata) )
    img.destroy!
  end

  rawdata = insp.getRawData
  cspace = "/#{insp.cspace}"
  fmt = insp.format
  imgcompr = insp.compression

  if cspace.eql?( '/Indexed' ) and not insp.palette.nil?
    # A palette whose entries all have equal components is written with
    # one byte per entry over /DeviceGray, any other palette with three
    # bytes per entry over /DeviceRGB.
    cpal = insp.palette
    rgb = cpal.any?{ |c| c[0] != c[1] or c[0] != c[2] }
    base = rgb ? '/DeviceRGB' : '/DeviceGray'

    cspace = "[/Indexed #{base} #{cpal.length - 1} < "
    cpal.each do |c|
      if rgb
        cspace << sprintf( "%02x %02x %02x ",c[0],c[1],c[2] )
      else
        cspace << sprintf( "%02x ",c[0] )
      end
    end
    cspace << '>]'

    procSet << '/ImageI' unless procSet.include? '/ImageI'

  elsif not cspace.eql?( '/DeviceGray' ) and not procSet.include?( '/ImageC' )
    procSet << '/ImageC'
  end

  # Number of color components per sample, needed for DecodeParms.
  per_comp = case cspace
             when '/DeviceRGB'  then 3
             when '/DeviceCMYK' then 4
             else 1
             end

  image = XObj.new( Hash[
    'Type'        => '/XObject',
    'Subtype'     => '/Image',
    'OC'          => ref( ocID ),
    'Width'       => insp.width,
    'Height'      => insp.height,
    'Interpolate' => 'true'
  ], rawdata )

  # For JPEG2000 neither the bit depth nor the color space is written.
  unless fmt.eql? :JPEG2000
    image.addToDict( 'BitsPerComponent',insp.depth )
    image.addToDict( 'ColorSpace',"#{cspace}" )
  end
  image.addToDict( 'Filter',"/#{imgcompr}" ) unless insp.compression.eql? :NoCompression

  if [:PNG, :TIFF].include? fmt
    predictor = fmt.eql?( :PNG ) ? 15 : 2
    image.addToDict( 'DecodeParms',
      "<< /Predictor #{predictor} /Colors #{per_comp} /BitsPerComponent #{insp.depth} /Columns #{insp.width} >>" )
  end
  return image
end
|
694
|
+
|
695
|
+
# Format an indirect reference to the object with ID x, generation 0.
def ref(x)
  "#{x} 0 R"
end
|
698
|
+
end
|
699
|
+
|