pdfbeads 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/COPYING +339 -0
- data/ChangeLog +3 -0
- data/README +53 -0
- data/bin/pdfbeads +189 -0
- data/doc/pdfbeads.ru.html +509 -0
- data/lib/imageinspector.rb +503 -0
- data/lib/pdfbeads.rb +93 -0
- data/lib/pdfbeads/pdfbuilder.rb +699 -0
- data/lib/pdfbeads/pdfdoc.rb +149 -0
- data/lib/pdfbeads/pdffont.rb +533 -0
- data/lib/pdfbeads/pdflabels.rb +139 -0
- data/lib/pdfbeads/pdfpage.rb +466 -0
- data/lib/pdfbeads/pdftoc.rb +160 -0
- metadata +82 -0
data/lib/pdfbeads.rb
ADDED
@@ -0,0 +1,93 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
######################################################################
|
4
|
+
#
|
5
|
+
# PDFBeads -- convert scanned images to a single PDF file
|
6
|
+
# Version 1.0
|
7
|
+
#
|
8
|
+
# Unlike other PDF creation tools, this utility attempts to implement
|
9
|
+
# the approach typically used for DjVu books. Its key feature is
|
10
|
+
# separating scanned text (typically black, but indexed images with
|
11
|
+
# a small number of colors are also accepted) from halftone images
|
12
|
+
# placed into a background layer.
|
13
|
+
#
|
14
|
+
# Copyright (C) 2010 Alexey Kryukov (amkryukov@gmail.com).
|
15
|
+
# All rights reserved.
|
16
|
+
#
|
17
|
+
# This program is free software; you can redistribute it and/or modify
|
18
|
+
# it under the terms of the GNU General Public License as published by
|
19
|
+
# the Free Software Foundation; either version 2 of the License, or
|
20
|
+
# (at your option) any later version.
|
21
|
+
#
|
22
|
+
# This program is distributed in the hope that it will be useful,
|
23
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
24
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
25
|
+
# GNU General Public License for more details.
|
26
|
+
#
|
27
|
+
# You should have received a copy of the GNU General Public License
|
28
|
+
# along with this program; if not, write to the Free Software
|
29
|
+
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
30
|
+
#
|
31
|
+
#######################################################################
|
32
|
+
|
33
|
+
require 'iconv'
|
34
|
+
require 'zlib'
|
35
|
+
|
36
|
+
require 'RMagick'
|
37
|
+
include Magick
|
38
|
+
|
39
|
+
begin
|
40
|
+
require 'hpricot'
|
41
|
+
$has_hpricot = true
|
42
|
+
rescue LoadError
|
43
|
+
$stderr.puts( "Warning: the hpricot extension is not available." )
|
44
|
+
$stderr.puts( " pdfbeads will not be able to read OCR data from hOCR files." )
|
45
|
+
$has_hpricot = false
|
46
|
+
end
|
47
|
+
|
48
|
+
unless ''.respond_to? :ord
|
49
|
+
$KCODE = 'u'
|
50
|
+
require 'jcode'
|
51
|
+
end
|
52
|
+
|
53
|
+
class String
  # Protect strings which are supposed to be treated as a raw sequence of
  # bytes. This matters for Ruby 1.9 and later; on earlier versions the
  # method simply returns the receiver untouched.
  unless method_defined?( :to_binary )
    def to_binary
      force_encoding( 'ASCII-8BIT' ) if respond_to?( :force_encoding )
      self
    end
  end

  # In Ruby 1.9 we sometimes have to mark a string as UTF-8 encoded even
  # though we know for certain that it is not. The receiver is retagged in
  # place and returned.
  unless method_defined?( :to_text )
    def to_text
      force_encoding( 'UTF-8' ) if respond_to?( :force_encoding )
      self
    end
  end

  # Get a Unicode ordinal for an encoded character (Ruby < 1.9 has no
  # standard method for that). Only defined when String#ord is missing.
  # Any conversion failure yields 0x3F, the code of a question mark.
  unless method_defined?( :ord )
    def ord
      utf16 = Iconv.iconv( 'utf-16be','utf-8',self ).first
      utf16.unpack('n')[0]
    rescue
      0x3F # Question mark
    end
  end
end
|
85
|
+
|
86
|
+
require 'imageinspector'
|
87
|
+
|
88
|
+
# Namespace for the PDFBeads library.
module PDFBeads
  # Version string of the library.
  VERSION = '1.0'

  # The implementation files are required from inside the module body, i.e.
  # only after the PDFBeads constant exists.
  require 'pdfbeads/pdfbuilder'
  require 'pdfbeads/pdfpage'
end
|
93
|
+
|
@@ -0,0 +1,699 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: UTF-8
|
3
|
+
|
4
|
+
######################################################################
|
5
|
+
#
|
6
|
+
# PDFBeads -- convert scanned images to a single PDF file
|
7
|
+
# Version 1.0
|
8
|
+
#
|
9
|
+
# Unlike other PDF creation tools, this utility attempts to implement
|
10
|
+
# the approach typically used for DjVu books. Its key feature is
|
11
|
+
# separating scanned text (typically black, but indexed images with
|
12
|
+
# a small number of colors are also accepted) from halftone images
|
13
|
+
# placed into a background layer.
|
14
|
+
#
|
15
|
+
# Copyright (C) 2010 Alexey Kryukov (amkryukov@gmail.com).
|
16
|
+
# All rights reserved.
|
17
|
+
#
|
18
|
+
# This program is free software; you can redistribute it and/or modify
|
19
|
+
# it under the terms of the GNU General Public License as published by
|
20
|
+
# the Free Software Foundation; either version 2 of the License, or
|
21
|
+
# (at your option) any later version.
|
22
|
+
#
|
23
|
+
# This program is distributed in the hope that it will be useful,
|
24
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
25
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
26
|
+
# GNU General Public License for more details.
|
27
|
+
#
|
28
|
+
# You should have received a copy of the GNU General Public License
|
29
|
+
# along with this program; if not, write to the Free Software
|
30
|
+
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
31
|
+
#
|
32
|
+
#######################################################################
|
33
|
+
|
34
|
+
require 'time'
|
35
|
+
require 'stringio'
|
36
|
+
|
37
|
+
# The key class where the actual generation of a PDF file is performed.
|
38
|
+
class PDFBeads::PDFBuilder
|
39
|
+
require 'pdfbeads/pdfdoc'
|
40
|
+
require 'pdfbeads/pdffont'
|
41
|
+
require 'pdfbeads/pdflabels'
|
42
|
+
require 'pdfbeads/pdftoc'
|
43
|
+
|
44
|
+
# Maps image type names to PDF colour space names. Any name not listed
# here resolves to '/DeviceRGB' through the hash default.
@@cmodes = Hash.new( '/DeviceRGB' ).update(
  'BilevelType'              => '/DeviceGray',
  'GrayscaleType'            => '/DeviceGray',
  'PaletteType'              => '/Indexed',
  'PaletteMatteType'         => '/Indexed',
  'TrueColorType'            => '/DeviceRGB',
  'TrueColorMatteType'       => '/DeviceRGB',
  'ColorSeparationType'      => '/DeviceCMYK',
  'ColorSeparationMatteType' => '/DeviceCMYK',
  'PaletteBilevelMatteType'  => '/DeviceGray'
)
|
56
|
+
|
57
|
+
# Creates a builder with an empty document.
#
# pdfargs -- option hash; the keys read elsewhere in this class are :labels,
#            :toc, :meta and :pagelayout
def initialize( pdfargs )
  @pdfargs = pdfargs
  @now     = Time.now
  @doc     = Doc.new
  @fdata   = FontDataProvider.new

  # Path and PDF object of the JBIG2 symbol dictionary loaded most recently
  # (see loadJBIG2Page); nothing has been loaded yet.
  @dictpath, @dictobj = '', nil
end
|
66
|
+
|
67
|
+
# Builds the whole PDF object tree in @doc: the catalog, the info
# dictionary, two optional content groups (foreground/background), one page
# object per entry of +pagefiles+, the fonts needed for the hidden OCR text,
# the page labels and the outline (TOC).
#
# pagefiles -- collection responding to each; every page descriptor must
#              provide name, width, height, x_res, y_res, stencils,
#              fg_layer, bg_layer and hocr_path
# st_format -- 'JBIG2' loads the stencils with loadJBIG2Page, any other
#              value loads them with loadCCITTPage
#
# Progress is reported on stderr.
def process( pagefiles,st_format )
  labels = toc = nil
  labels = PDFLabels.new( @pdfargs[:labels] ) unless @pdfargs[:labels].nil?
  toc = PDFTOC.new( @pdfargs[:toc] ) unless @pdfargs[:toc].nil?
  meta = parseMeta( @pdfargs[:meta] )
  has_labels = ( !labels.nil? and labels.length > 0 )

  cat = XObj.new(Hash[
    'Type'       => '/Catalog',
    'PageLayout' => "/#{@pdfargs[:pagelayout]}"
  ])
  @doc.addObject(cat)

  # PDF date: the timezone designator is 'Z' for UTC, otherwise a sign
  # followed by the absolute offset as HH'mm. The sign is applied to the
  # absolute value because integer division floors negative numbers.
  gmt_offset = @now.gmt_offset
  creationDate = sprintf( "D:%04d%02d%02d%02d%02d%02d",
    @now.year, @now.month, @now.day, @now.hour, @now.min, @now.sec )
  if gmt_offset == 0
    creationDate << 'Z'
  else
    gmt_mins = gmt_offset.abs/60
    creationDate << sprintf( "%s%02d'%02d",
      ( gmt_offset > 0 ? '+' : '-' ), gmt_mins/60, gmt_mins%60 )
  end

  info = XObj.new(Hash[
    'Creator'      => "(PDFBeads)",
    'Producer'     => "(PDFBeads)",
    'CreationDate' => "(#{creationDate})"
  ])
  @doc.addObject(info)
  # Metadata values arrive as UTF-16BE bytes (see parseMeta); the BOM marks
  # them as such inside the PDF string.
  meta.each_key do |key|
    info.addToDict(key, "(\xFE\xFF#{meta[key].to_text})")
  end

  out = XObj.new(Hash[
    'Type'  => '/Outlines',
    'Count' => 0
  ])
  @doc.addObject(out)
  cat.addToDict('Outlines', ref(out.getID))

  pages = XObj.new(Hash[
    'Type' => '/Pages'
  ])
  @doc.addObject(pages)
  cat.addToDict('Pages', ref(pages.getID))

  creator = XObj.new(Hash[
    'Subtype' => '/Artwork',
    'Creator' => "(PDFBeads)",
    'Feature' => '(Layers)'
  ])
  @doc.addObject(creator)

  ocFore = XObj.new(Hash[
    'Type'   => '/OCG',
    'Name'   => '(Foreground)',
    'Usage'  => "<</CreatorInfo #{ref(creator.getID)}>>",
    'Intent' => '[/View/Design]'
  ])
  @doc.addObject(ocFore)
  ocBack = XObj.new(Hash[
    'Type'   => '/OCG',
    'Name'   => '(Background)',
    'Usage'  => "<</CreatorInfo #{ref(creator.getID)}>>",
    'Intent' => '[/View/Design]'
  ])
  @doc.addObject(ocBack)
  cat.addToDict('OCProperties',
    sprintf("<< /OCGs[%s %s] /D<< /Intent /View /BaseState (ON) /Order[%s %s] >>>>",
      ref(ocFore.getID), ref(ocBack.getID), ref(ocFore.getID), ref(ocBack.getID)))

  page_objs = Array.new()
  pages_by_num = Hash.new()
  pidx = 0
  cur_range_id = 0

  if has_labels
    nTree = "<</Nums[\n"
    labels.each do |rng|
      nTree << "#{rng[:first]} << "
      # Convert the prefix only when the range actually has one.
      if rng.has_key? :prefix
        ltitl = Iconv.iconv( "utf-16be", "utf-8", rng[:prefix] ).first.to_text
        nTree << "/P (\xFE\xFF#{ltitl}) "
      end
      nTree << "/S /#{rng[:style]} " if rng.has_key? :style
      nTree << "/St #{rng[:start]}" if rng.has_key? :start
      nTree << ">>\n"
    end

    nTree << "]\n>>"
    cat.addToDict('PageLabels', nTree)
  end

  # A font is needed as soon as one page comes with hOCR data.
  needs_font = false
  fonts = encodings = nil
  pagefiles.each do |pagefile|
    unless pagefile.hocr_path.nil?
      needs_font = true
      break
    end
  end

  if needs_font
    fonts = Array.new()
    encodings = [ [' '] ]
    fdict = XObj.new( Hash[] )
    @doc.addObject( fdict )

    descr = XObj.new( Hash[
      'Type'     => '/FontDescriptor',
      'BaseFont' => '/Times-Roman'
    ] )
    @fdata.header.each_key do |key|
      descr.addToDict( key,@fdata.header[key] )
    end
    @doc.addObject( descr )
  end

  pagefiles.each do |pagefile|
    procSet = ['/PDF', '/ImageB']
    c_str = ''
    doc_objs = Array.new()
    lastimg = 0

    width = pagefile.width; height = pagefile.height
    xres = pagefile.x_res; yres = pagefile.y_res
    pwidth = width.to_f / xres * 72
    pheight = height.to_f / yres * 72

    pagefile.stencils.each do |s|
      if st_format.eql? 'JBIG2'
        loaded = loadJBIG2Page( s[:jbig2path],s[:jbig2dict],ref(ocFore.getID) )
      else
        loaded = loadCCITTPage( s[:path],ref(ocFore.getID) )
      end
      # Stop at the first stencil that cannot be loaded, but keep the
      # resolution known so far: it is still needed for the hOCR text.
      break if loaded.nil?
      xobj,width,height,xres,yres = loaded
      break if xobj.nil?

      color = s[:rgb].join(' ') << ' rg'
      doc_objs << xobj

      c_str << "#{color} /Im#{lastimg} Do "
      lastimg += 1
    end

    fg_image = bg_image = nil
    fg_image = loadImage( pagefile.fg_layer,ocFore.getID,procSet ) unless pagefile.fg_layer.nil?
    bg_image = loadImage( pagefile.bg_layer,ocBack.getID,procSet ) unless pagefile.bg_layer.nil?

    contents = XObj.new(Hash[
      'Filter' => '/FlateDecode'
    ])
    resobj = XObj.new(Hash.new())
    resources = XObj.new(Hash[
      'XObject' => ref(resobj.getID)
    ])

    unless fg_image.nil?
      # The first stencil becomes the soft mask of the foreground image.
      # Without any stencil the image is painted unmasked.
      mask = doc_objs[0]
      unless mask.nil?
        fg_image.addToDict('SMask', ref(mask.getID))
        mask.removeFromDict('ImageMask')
        mask.addToDict('Decode', '[1 0]')
      end
      resobj.addToDict('Im0', ref(fg_image.getID))
      doc_objs << fg_image
      c_str = '/Im0 Do '
    else
      doc_objs.each_index do |i|
        resobj.addToDict( "Im#{i}", ref(doc_objs[i].getID) )
      end
    end

    unless bg_image.nil?
      # The background is painted first, under everything else.
      c_str = "/Im#{resobj.dictLength} Do " << c_str
      resobj.addToDict( "Im#{resobj.dictLength}", ref(bg_image.getID) )
      doc_objs << bg_image
    end
    c_str = sprintf( "q %.2f 0 0 %.2f 0 0 cm %sQ",pwidth,pheight,c_str )

    doc_objs.concat( [contents, resobj, resources] )

    hocr = nil
    unless pagefile.hocr_path.nil?
      # File.open instead of Kernel#open: the latter would run a command
      # for a path starting with '|'.
      hocr = File.open( pagefile.hocr_path ) { |f| Hpricot.parse( f ) }
      procSet << '/Text'
      c_str << getPDFText( hocr,pheight,72.0/xres,72.0/yres,encodings )
    end

    contents.reinit( Hash[
      'Filter' => '/FlateDecode'
    ], Zlib::Deflate.deflate( c_str,9 ) )
    resources.addToDict( 'ProcSet', "[ #{procSet.join(' ')} ]" )
    resources.addToDict( 'Font', ref( fdict.getID ) ) unless hocr.nil?

    page = XObj.new(Hash[
      'Type'      => '/Page',
      'Parent'    => ref( pages.getID ),
      'MediaBox'  => sprintf( "[ 0 0 %.02f %.02f ]",pwidth,pheight ),
      'Contents'  => ref( contents.getID ),
      'Resources' => ref( resources.getID )
    ])
    # By default acroread uses /DeviceCMYK as a transparency blending space,
    # so adding an SMask image to a page would result in colors being
    # shifted, unless we take special care of this. For more details see
    # http://comments.gmane.org/gmane.comp.tex.pdftex/3747
    unless fg_image.nil?
      cspace = '/DeviceRGB'
      cspace = fg_image.getFromDict( 'ColorSpace' ) if fg_image.hasInDict( 'ColorSpace' )
      page.addToDict( 'Group', "<< /S /Transparency /CS #{cspace} >>" )
    end
    doc_objs << page
    doc_objs.each{ |x| @doc.addObject(x) }
    page_objs << page

    # Remember the object ID under the page's label so that TOC entries can
    # be resolved later.
    pkey = pidx + 1
    pkey = labels.getPageLabel( cur_range_id,pidx ) if has_labels
    pages_by_num[pkey] = page.getID
    pidx += 1
    if has_labels
      if cur_range_id < labels.length - 1 and labels[cur_range_id + 1][:first] == pidx
        cur_range_id += 1
      end
    end

    $stderr.puts("Processed #{pagefile.name}\n")
    $stderr.puts("  Added background image from #{pagefile.bg_layer}\n") unless bg_image.nil?
    $stderr.puts("  Added foreground image from #{pagefile.fg_layer}\n") unless fg_image.nil?
  end

  # The page tree node is completed once, after all pages are known.
  pages.addToDict( 'Count', page_objs.length )
  pages.addToDict( 'Kids', '[' << page_objs.map{|x| ref(x.getID).to_s}.join(' ') << ']' )

  if needs_font
    fidx = 1
    encodings.each do |enc|
      font = addFont( descr,enc,"Fnt#{fidx}" )
      fdict.addToDict( "Fnt#{fidx}",ref(font.getID) )
      fonts << font
      fidx += 1
    end
  end

  if toc != nil and toc.length > 0
    getOutlineObjs( toc,pages_by_num,page_objs[0].getID )
    cat.addToDict('Outlines', ref(toc[0][:pdfobj].getID))
    cat.addToDict('PageMode', "/UseOutlines")
  end
end
|
315
|
+
|
316
|
+
# Output the created PDF file to the disk.
#
# outpath -- path of the file to write, or the literal string 'STDOUT' to
#            send the document to standard output
#
# The stream is always put into binary mode, since a PDF is binary data, and
# a file opened here is closed even when writing fails. Failures are
# reported on stderr instead of being raised.
def output( outpath )
  begin
    if outpath.eql? 'STDOUT'
      $stdout.binmode
      $stdout.write( @doc.to_s )
    else
      File.open( outpath,'wb' ) { |out| out.write( @doc.to_s ) }
    end
  rescue
    $stderr.puts( "Error: could not write to #{outpath}" )
  end
end
|
332
|
+
|
333
|
+
private
|
334
|
+
|
335
|
+
# Reads document metadata from a UTF-8 text file.
#
# Lines starting with '#' are skipped. A metadata line has the form
#   Key: "value"
# optionally with a leading slash before the key. Only the keys Title,
# Author, Subject and Keywords are accepted; anything else is ignored.
#
# path -- file to read; nil or '' means "no metadata"
#
# Returns a Hash mapping each accepted key to its value converted to
# UTF-16BE bytes. Lines that are not valid UTF-8 are reported on stderr and
# skipped.
def parseMeta( path )
  ret = Hash.new()
  return ret if path.nil? or path.eql? ''

  keys = [ 'Title', 'Author', 'Subject', 'Keywords' ]
  File.open( path,'r' ) do |fin|
    fin.set_encoding 'UTF-8' if fin.respond_to? :set_encoding
    fin.each do |fl|
      # Matching a regexp against a broken UTF-8 string raises, so such
      # lines have to be rejected before any pattern is applied.
      if fl.respond_to? :valid_encoding? and not fl.valid_encoding?
        $stderr.puts("Error: metadata should be specified in utf-8")
        next
      end
      next if /^\#/.match( fl )
      next unless /^\/?([A-Za-z]+)[ \t]*:[ \t]+\"(.*)\"/.match( fl )

      key,value = $1,$2
      next unless keys.include? key

      begin
        if value.respond_to? :encode
          # Built-in transcoding; the result is tagged as raw bytes.
          ret[key] = value.encode( 'UTF-16BE','UTF-8' ).force_encoding( 'ASCII-8BIT' )
        else
          ret[key] = Iconv.iconv( "utf-16be", "utf-8", value ).first
        end
      rescue
        $stderr.puts("Error: metadata should be specified in utf-8")
      end
    end
  end
  ret
end
|
359
|
+
|
360
|
+
# Creates the PDF outline objects for a parsed table of contents and adds
# them to the document.
#
# toc      -- TOC entries; toc[0] is the root, the remaining entries are
#             items providing :title (UTF-16BE bytes), :ref, :parent,
#             :children and optionally :prev, :next and :open
# page_ids -- Hash mapping page labels to page object IDs
# fp_id    -- object ID of the first page; kept for interface compatibility,
#             it is not used: an entry pointing to an unknown page gets no
#             destination and is painted grey instead
#
# The created object of every entry is stored under its :pdfobj key.
def getOutlineObjs( toc,page_ids,fp_id )
  root = toc[0]
  root[:pdfobj] = XObj.new( Hash[
    'Type'  => '/Outlines',
    'Count' => root.getChildrenCount
  ])
  @doc.addObject(root[:pdfobj])

  toc[1..-1].each do |item|
    dest = nil
    if page_ids.has_key? item[:ref]
      dest = page_ids[item[:ref]]
    else
      $stderr.puts("Malformed TOC: there is no page #{item[:ref]} in this document.")
    end

    # A PDF literal string is scanned byte by byte before the text encoding
    # is applied, so every 0x28, 0x29 and 0x5C byte has to be escaped, no
    # matter which half of a UTF-16 code unit it belongs to. The work is
    # done on a copy; the TOC entry itself is left untouched.
    item_text = item[:title].dup.to_binary
    item_text = item_text.gsub( /[()\\]/n ) { |c| "\\" << c }

    item[:pdfobj] = XObj.new(Hash[
      'Title'  => "(\xFE\xFF#{item_text.to_text})",
      'Parent' => ref(item[:parent][:pdfobj].getID)
    ])
    if dest != nil
      item[:pdfobj].addToDict('Dest', "[ #{dest} 0 R /XYZ null null null ]")
    else
      item[:pdfobj].addToDict('C', "[0.75 0.75 0.75]")
    end

    # A negative count marks an entry whose children are initially hidden.
    if item[:children].length > 0
      cnt = item.getChildrenCount
      if item[:open]
        item[:pdfobj].addToDict('Count', cnt)
      else
        item[:pdfobj].addToDict('Count', -cnt)
      end
    end

    # Link the entry into the sibling chain of its parent.
    if item.has_key? :prev
      item[:prev][:pdfobj].addToDict('Next', ref(item[:pdfobj].getID))
      item[:pdfobj].addToDict('Prev', ref(item[:prev][:pdfobj].getID))
    else
      item[:parent][:pdfobj].addToDict('First', ref(item[:pdfobj].getID))
    end

    unless item.has_key? :next
      item[:parent][:pdfobj].addToDict('Last', ref(item[:pdfobj].getID))
    end

    @doc.addObject(item[:pdfobj])
  end
end
|
414
|
+
|
415
|
+
# Returns an array containing the coordinates of the bounding box around
# an element.
#
# element -- hOCR node; its 'title' attribute is searched for
#            "bbox x0 y0 x1 y1"
# xscale  -- factor applied to the horizontal coordinates (x0 and x1)
# yscale  -- factor applied to the vertical coordinates (y0 and y1)
#
# Returns [x0, y0, x1, y1] as scaled floats, or [0,0,0,0] when the element
# carries no bounding box.
def elementCoordinates( element,xscale,yscale )
  title = element.attributes.to_hash['title']
  return [0,0,0,0] if title.nil?
  return [0,0,0,0] unless /bbox((\s+\d+){4})/.match( title )

  x0,y0,x1,y1 = $1.strip.split( /\s+/ ).map{ |c| c.to_i }
  return [ (x0*xscale).to_f,(y0*yscale).to_f,
           (x1*xscale).to_f,(y1*yscale).to_f ]
end
|
429
|
+
|
430
|
+
# Produces the content stream fragment (BT ... ET) holding the invisible
# text layer of one page.
#
# hocr      -- parsed hOCR document; it is queried for the Content-Type meta
#              element and for the span elements of class 'ocr_line'
# pheight   -- page height in points
# xscale    -- factor converting horizontal hOCR coordinates to points
# yscale    -- factor converting vertical hOCR coordinates to points
# encodings -- array of character lists, one list per font (list number i
#              belongs to /Fnt<i+1>); it is extended in place with the
#              characters met on this page
#
# Returns the fragment as a string. Text is written in rendering mode 3
# (invisible) as hexadecimal strings of indices into the current list.
def getPDFText( hocr,pheight,xscale,yscale,encodings )
  fsize = 10
  cur_enc = encodings[0]
  ret = " BT 3 Tr /Fnt1 #{fsize} Tf "

  charset = 'utf-8'
  hocr.search("//meta[@http-equiv='Content-Type']").each do |el|
    attrs = el.attributes.to_hash
    charset = $1 if attrs.has_key? 'content' and
      /\Atext\/html;charset=([A-Za-z0-9-]+)\Z/i.match( attrs['content'] )
  end

  hocr.search("//span[@class='ocr_line']").each do |line|
    # Every run of line breaks inside a line becomes a single space.
    txt = line.to_plain_text.strip.gsub( /[\n\r]+/,' ' )
    begin
      txt = Iconv.iconv( 'utf-8',charset,txt ).first unless charset.downcase.eql? 'utf-8'
    rescue
      txt = ''
    end
    next if txt.eql? ''
    txt.force_encoding( 'utf-8' ) if txt.respond_to? :force_encoding
    # A trailing hyphen is replaced with U+00AD (soft hyphen).
    txt.sub!( /-\Z/, "\xC2\xAD" )

    # Scale the text so that its width matches the width of the line box.
    bbox = elementCoordinates( line,xscale,yscale )
    ratio = ( bbox[2] - bbox[0] ) / @fdata.getLineWidth( txt,fsize )
    ret << sprintf( "%f %f %f %f %f %f Tm ",
      ratio, 0, 0, ratio, bbox[0], pheight - bbox[3] - @fdata.header['Descent'] * fsize / 1000.0)

    txt8 = ''
    txt.each_char do |char|
      # The conversion is only used to detect invalid UTF-8 sequences.
      begin
        Iconv.iconv( "utf-16be","utf-8",char )
      rescue
        rawbytes = char.unpack( 'C*' )
        bs = ''
        rawbytes.each{ |b| bs << sprintf( "%02x",b ) }
        $stderr.puts( "Warning: an invalid UTF-8 sequence (#{bs}) in the hOCR data." )
        char = '?' * rawbytes.length
      end

      encoded = false
      unless cur_enc.include? char
        # First look for another list which already has the character.
        encodings.each_index do |i|
          enc = encodings[i]
          next if enc == cur_enc

          if enc.include? char
            ret << "<#{txt8}> Tj "
            cur_enc = enc
            ret << "/Fnt#{i + 1} #{fsize} Tf "
            txt8 = ''
            encoded = true
            break
          end
        end

        # Otherwise append it to the last list, or open a new list once
        # the last one holds 256 characters.
        unless encoded
          last = encodings[-1]
          if last.length < 256
            last << char
          else
            last = [ ' ',char ]
            encodings << last
          end

          if cur_enc != last
            ret << "<#{txt8}> Tj "
            cur_enc = last
            ret << "/Fnt#{encodings.length} #{fsize} Tf "
            txt8 = ''
          end
        end
      end

      txt8 << sprintf( "%02X",cur_enc.index(char) )
    end

    ret << "<#{txt8}> Tj " unless txt8.eql? ''
  end

  ret << "ET "
  return ret
end
|
513
|
+
|
514
|
+
# Adds a Type1 font with a custom encoding to the document, together with
# its encoding and ToUnicode objects.
#
# descr -- font descriptor object shared by all fonts
# fenc  -- list of the characters the font has to encode
# fname -- resource name of the font, without the leading slash
#
# Returns the font object.
def addFont( descr,fenc,fname )
  enc_str = @fdata.getEncoding( fenc ).join( ' ' )
  enc = XObj.new( Hash[
    'Type'        => "/Encoding",
    'Differences' => "[ 0 #{enc_str} ]"
  ])
  @doc.addObject( enc )

  toUni = @fdata.getCMAP( fenc )
  @doc.addObject( toUni )

  font = XObj.new( Hash[
    'BaseFont'       => '/Times-Roman',
    'Name'           => "/#{fname}",
    'Subtype'        => '/Type1',
    'Type'           => '/Font',
    'FirstChar'      => 0,
    'LastChar'       => fenc.length - 1,
    'Widths'         => '[ ' << @fdata.getWidths(fenc).map{|w| w.to_s}.join(' ') << ' ]',
    'FontDescriptor' => ref(descr.getID),
    'ToUnicode'      => ref(toUni.getID)
  ] )
  # The encoding object has just been created above, so it always exists;
  # the former fallback to /WinAnsiEncoding could never be taken.
  font.addToDict( 'Encoding',ref(enc.getID) )
  @doc.addObject( font )
  return font
end
|
544
|
+
|
545
|
+
# Loads a bitonal stencil as a CCITT Group 4 compressed image mask.
#
# path  -- image file to load
# ocref -- reference to the optional content group the image belongs to
#
# An image which is not already CCITT compressed in a single strip is
# recompressed through ImageMagick first.
#
# Returns [ xobj,width,height,xres,yres ], or nil when the image could not
# be inspected.
def loadCCITTPage( path,ocref )
  stencil = ImageInspector.new( path )
  return nil if stencil.width.nil?

  width = stencil.width
  height = stencil.height
  xres = stencil.x_dpi
  yres = stencil.y_dpi

  # 0x116 is the TIFF RowsPerStrip tag.
  # NOTE(review): presumably the tag (or the whole tag table) is missing for
  # non-TIFF input -- confirm against ImageInspector. A missing value is
  # treated as "not a single strip", which forces the recompression below.
  tags = stencil.tags
  strip_rows = ( tags.nil? ? nil : tags[0x116] )
  rows_per_strip = ( strip_rows.nil? ? 0 : strip_rows[0] )

  unless stencil.compression.eql? :CCITTFaxDecode and rows_per_strip >= height
    img = ImageList.new( path )
    imgdata = img.to_blob{
      self.format = 'TIFF'
      self.define( 'TIFF','rows-per-strip',height )
      self.compression = Group4Compression
    }
    stencil = ImageInspector.new( StringIO.new(imgdata) )
    img.destroy!
  end
  body = stencil.getRawData

  xobj = XObj.new(Hash[
    'Type'             => '/XObject',
    'Subtype'          => '/Image',
    'OC'               => ocref,
    'Width'            => width.to_s,
    'Height'           => height.to_s,
    'ImageMask'        => 'true',
    'ColorSpace'       => '/DeviceGray',
    'BitsPerComponent' => '1',
    'Filter'           => '/CCITTFaxDecode',
    'DecodeParms'      => "<< /Columns #{width} /K -1 >>"
  ], body)

  return [ xobj,width,height,xres,yres ]
end
|
582
|
+
|
583
|
+
# Loads a JBIG2 encoded stencil as an image mask.
#
# path     -- JBIG2 page file
# dictpath -- file with the symbol dictionary used by the page; it is added
#             to the document only when it differs from the dictionary
#             loaded by the previous call
# ocref    -- reference to the optional content group the image belongs to
#
# Both files are read in binary mode and closed right after reading.
#
# Returns [ xobj,width,height,xres,yres ], or nil (with a message on stderr)
# when a file cannot be read or is too short to hold the page header.
def loadJBIG2Page( path,dictpath,ocref )
  begin
    jbig2 = File.open( path,'rb' ) { |f| f.read }
    # Width, height and both resolutions are four big-endian 32-bit
    # integers starting at byte 11.
    width, height, xres, yres = jbig2[11...27].unpack( 'NNNN' )
    unless @dictpath.eql? dictpath
      symd_f = File.open( dictpath,'rb' ) { |f| f.read }
      symd_o = @doc.addObject( XObj.new(Hash.new(),symd_f) )
      @dictpath = dictpath
      @dictobj = symd_o
    end
  rescue
    $stderr.puts( "Page not completed: could not access #{path}" )
    return nil
  end

  xobj = XObj.new(Hash[
    'Type'             => '/XObject',
    'Subtype'          => '/Image',
    'OC'               => ocref,
    'Width'            => width.to_s,
    'Height'           => height.to_s,
    'ImageMask'        => 'true',
    'ColorSpace'       => '/DeviceGray',
    'BitsPerComponent' => '1',
    'Filter'           => '/JBIG2Decode',
    'DecodeParms'      => "<< /JBIG2Globals #{@dictobj.getID} 0 R >>"
  ], jbig2)

  return [ xobj,width,height,xres,yres ]
end
|
613
|
+
|
614
|
+
# Loads a foreground or background picture as a PDF image object.
#
# impath  -- image file to load
# ocID    -- object ID of the optional content group the image belongs to
# procSet -- list of procedure set names of the page; '/ImageI' or
#            '/ImageC' is appended when the image requires it
#
# Returns the image object, or nil when the image could not be inspected.
def loadImage( impath,ocID,procSet )
  insp = ImageInspector.new( impath )
  return nil if insp.width.nil?

  # JPEG, JPEG2000 and PNG images can be handled directly. We can also
  # handle uncompressed TIFF files, although it is very unlikely someone
  # would use them for a page background. Things are more difficult for
  # compressed TIFF images, as they normally contain several compressed
  # chunks which can't simply be concatenated; they are only taken as they
  # are when all rows are stored in a single strip. Every other image is
  # converted by ImageMagick into a zip-compressed PNG, and the raw data
  # are then retrieved from that PNG image.
  usable_as_is =
    [ :JPEG, :JPEG2000, :PNG ].include?( insp.format ) ||
    ( insp.format.eql?( :TIFF ) &&
      ( insp.compression.eql?( :NoCompression ) ||
        ( [ :FlateDecode,:LZWDecode,:CCITTFaxDecode ].include?( insp.compression ) &&
          insp.tags[0x0116][0] >= insp.height ) ) )

  unless usable_as_is
    img = ImageList.new( impath )
    imgdata = img.to_blob{
      self.format = 'PNG'
      self.quality = 95
      self.compression = ZipCompression
    }
    insp = ImageInspector.new( StringIO.new(imgdata) )
    img.destroy!
  end

  rawdata = insp.getRawData
  cspace = "/#{insp.cspace}"
  fmt = insp.format
  imgcompr = insp.compression

  if cspace.eql?( '/Indexed' ) && !insp.palette.nil?
    cpal = insp.palette
    # A palette made of grey shades only gets a grey base colour space.
    rgb = false
    cpal.each do |entry|
      next unless entry[0] != entry[1] || entry[0] != entry[2]
      rgb = true
      break
    end
    base = rgb ? '/DeviceRGB' : '/DeviceGray'

    cspace = "[/Indexed #{base} #{cpal.length - 1} < "
    cpal.each do |entry|
      cspace << sprintf( "%02x ",entry[0] )
      cspace << sprintf( "%02x %02x ",entry[1],entry[2] ) if rgb
    end
    cspace << '>]'

    procSet << '/ImageI' unless procSet.include? '/ImageI'
  elsif !cspace.eql?( '/DeviceGray' ) && !procSet.include?( '/ImageC' )
    procSet << '/ImageC'
  end

  # Number of colour components per pixel, as used by the predictor.
  per_comp = case cspace
             when '/DeviceRGB'  then 3
             when '/DeviceCMYK' then 4
             else 1
             end

  image = XObj.new( Hash[
    'Type'        => '/XObject',
    'Subtype'     => '/Image',
    'OC'          => ref( ocID ),
    'Width'       => insp.width,
    'Height'      => insp.height,
    'Interpolate' => 'true'
  ], rawdata )

  # For JPEG2000 neither the depth nor the colour space is written.
  unless fmt.eql? :JPEG2000
    image.addToDict( 'BitsPerComponent',insp.depth )
    image.addToDict( 'ColorSpace',"#{cspace}" )
  end
  image.addToDict( 'Filter',"/#{imgcompr}" ) unless insp.compression.eql? :NoCompression

  if [:PNG, :TIFF].include? fmt
    predictor = fmt.eql?( :PNG ) ? 15 : 2
    image.addToDict( 'DecodeParms',
      "<< /Predictor #{predictor} /Colors #{per_comp} /BitsPerComponent #{insp.depth} /Columns #{insp.width} >>" )
  end

  image
end
|
694
|
+
|
695
|
+
# Formats an object ID as a PDF indirect reference with generation
# number 0, e.g. ref(12) gives "12 0 R".
def ref(x)
  "#{x} 0 R"
end
|
698
|
+
end
|
699
|
+
|