pdfbeads 1.0.9 → 1.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/ChangeLog +23 -0
- data/bin/pdfbeads +28 -3
- data/doc/pdfbeads.en.html +552 -0
- data/doc/pdfbeads.ru.html +74 -34
- data/lib/pdfbeads.rb +17 -6
- data/lib/pdfbeads/pdfbuilder.rb +254 -74
- data/lib/pdfbeads/pdfpage.rb +8 -8
- data/lib/pdfbeads/pdftoc.rb +7 -3
- metadata +80 -48
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 577a47277a3e474a47b740ce93ca89041402c4fb
|
4
|
+
data.tar.gz: 88a1f950e31e41e47f79ef978d9e589e1b9724fb
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 16ab45bd0c4d63d7c6f567f3df5e2dea2ecc1e9237ac9b5acfc9cd92e31a72a8df7ad184e5c7fd779cb990f58ff64dbb3027464c7579fd79d58fc270ac01c841
|
7
|
+
data.tar.gz: b946c901407f06aeaf509988341759ba4f4cfc217067b00fb72ad5bc9ae60e4d1dd0729bb7a3761a6c06f71816dcb9398e2cde4335ee8a5620c1686e208fe901
|
data/ChangeLog
CHANGED
@@ -55,3 +55,26 @@
|
|
55
55
|
* Don't attempt to use 'ocrx_word' elements which contain no bounding box
|
56
56
|
data (this should fix the problem with the hOCR output produced by some
|
57
57
|
tesseract versions).
|
58
|
+
|
59
|
+
2013 Mar 20 (Alexey Kryukov) Version 1.1.0
|
60
|
+
|
61
|
+
+ It is now possible to take the text layer from another PDF document (normally
|
62
|
+
this would be a file produced by passing the same set of images to an
|
63
|
+
OCR application) and embed it into the pdfbeads output. Warning: this feature
|
64
|
+
has been tested so far only with files produced with ABBYY FineReader. It may or
|
65
|
+
may not work with PDF files generated by other OCR programs.
|
66
|
+
|
67
|
+
* The default PDF page layout is now "OneColumn".
|
68
|
+
|
69
|
+
+ Make it possible to specify that the preferred reading direction for the
|
70
|
+
PDF document is right-to-left.
|
71
|
+
|
72
|
+
+ In order to simplify debugging of resulting files I have added a special
|
73
|
+
flag allowing to make the hidden text layer visible and to disable
|
74
|
+
compression in page streams.
|
75
|
+
|
76
|
+
2014 Jan 26 (Alexey Kryukov) Version 1.1.1
|
77
|
+
|
78
|
+
* hpricot is no longer developed, so switch to Nokogiri for hOCR processing.
|
79
|
+
|
80
|
+
+ English HTML documentation added.
|
data/bin/pdfbeads
CHANGED
@@ -41,9 +41,12 @@ include PDFBeads
|
|
41
41
|
pdfargs = Hash[
|
42
42
|
:labels => nil,
|
43
43
|
:toc => nil,
|
44
|
-
:pagelayout => '
|
44
|
+
:pagelayout => 'OneColumn',
|
45
45
|
:meta => nil,
|
46
|
-
:
|
46
|
+
:textpdf => nil,
|
47
|
+
:delfiles => false,
|
48
|
+
:debug => false,
|
49
|
+
:rtl => false
|
47
50
|
]
|
48
51
|
pageargs = Hash[
|
49
52
|
:threshold => 1,
|
@@ -87,6 +90,23 @@ OptionParser.new() do |opts|
|
|
87
90
|
|
88
91
|
pdfargs[:pagelayout] = pagelayout
|
89
92
|
end
|
93
|
+
opts.on("-R", "--right-to-left",
|
94
|
+
"Set the flag indicating that the preferred reading",
|
95
|
+
"direction for the resulting PDF file is right to left") do |rtl|
|
96
|
+
pdfargs[:rtl] = rtl
|
97
|
+
end
|
98
|
+
opts.on("-T", "--text-pdf PDFFILE",
|
99
|
+
"Specify a PDF file produced by passing the same set",
|
100
|
+
"of files to an OCR program. Pdfbeads will use that file",
|
101
|
+
"to generate the hidden text layer for its PDF output.") do |pdffile|
|
102
|
+
|
103
|
+
if $has_pdfreader
|
104
|
+
pdfargs[:textpdf] = pdffile
|
105
|
+
else
|
106
|
+
$stderr.puts( "Warning: the pdf/reader extension is not available." )
|
107
|
+
$stderr.puts( "\tthe -T/--text-pdf option is ignored." )
|
108
|
+
end
|
109
|
+
end
|
90
110
|
|
91
111
|
opts.separator "\n"
|
92
112
|
opts.separator "Image encoding and compression options:\n"
|
@@ -147,7 +167,7 @@ OptionParser.new() do |opts|
|
|
147
167
|
"Compression method for background images. Acceptable",
|
148
168
|
"values are JP2|JPX|JPEG2000, JPG|JPEG or PNG|LOSSLESS.",
|
149
169
|
"JP2 is used by default, unless this format is not",
|
150
|
-
"supported by the available version
|
170
|
+
"supported by the available ImageMagick version" ) do |format|
|
151
171
|
case format.upcase
|
152
172
|
when 'JP2', 'JPX', 'J2K', 'JPEG2000'
|
153
173
|
pageargs[:bg_format] = 'JP2'
|
@@ -178,6 +198,11 @@ OptionParser.new() do |opts|
|
|
178
198
|
"Print output to a file instead of STDERR") do |f|
|
179
199
|
outpath = f
|
180
200
|
end
|
201
|
+
opts.on("-D", "--debug",
|
202
|
+
"Simplify debugging the PDF output by making the hidden",
|
203
|
+
"text layer visible and using uncompressed page streams") do |dbg|
|
204
|
+
pdfargs[:debug] = dbg
|
205
|
+
end
|
181
206
|
opts.on_tail("-h", "--help", "Show this message") do
|
182
207
|
puts opts
|
183
208
|
exit
|
@@ -0,0 +1,552 @@
|
|
1
|
+
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
|
2
|
+
<html>
|
3
|
+
<head>
|
4
|
+
|
5
|
+
<title>PDFBeads -- convert scanned images to a single PDF file</title>
|
6
|
+
|
7
|
+
<meta content="text/html; charset=UTF-8" http-equiv="Content-Type">
|
8
|
+
|
9
|
+
<meta name="Generator" content="Written directly in html">
|
10
|
+
|
11
|
+
<meta name="Description" content="pdfbeads v. 1.1 User's Manual">
|
12
|
+
|
13
|
+
<style type="text/css">
|
14
|
+
body {
|
15
|
+
font-family: Times New Roman, Times, serif;
|
16
|
+
text-align: justify;
|
17
|
+
}
|
18
|
+
a:link {
|
19
|
+
color: blue; text-decoration: underline
|
20
|
+
}
|
21
|
+
a:hover {
|
22
|
+
color: fuchsia
|
23
|
+
}
|
24
|
+
a:active {
|
25
|
+
color: fuchsia
|
26
|
+
}
|
27
|
+
a:visited {
|
28
|
+
color: purple
|
29
|
+
}
|
30
|
+
h1 {
|
31
|
+
font-size: 36px;
|
32
|
+
font-family: Times New Roman, Times, serif;
|
33
|
+
text-align: center;
|
34
|
+
font-style: normal;
|
35
|
+
font-weight: bold
|
36
|
+
}
|
37
|
+
h2 {
|
38
|
+
font-size: 20px;
|
39
|
+
font-family: Arial, Helvetica, sans-serif;
|
40
|
+
text-align: center;
|
41
|
+
font-style: normal;
|
42
|
+
font-weight: bold;
|
43
|
+
}
|
44
|
+
h3 {
|
45
|
+
font-size: 16px;
|
46
|
+
font-family: Arial, Helvetica, sans-serif;
|
47
|
+
text-align: left;
|
48
|
+
font-style: italic;
|
49
|
+
font-weight: bold;
|
50
|
+
}
|
51
|
+
dt {
|
52
|
+
font-weight: bold;
|
53
|
+
}
|
54
|
+
</style>
|
55
|
+
|
56
|
+
</head>
|
57
|
+
|
58
|
+
<body>
|
59
|
+
|
60
|
+
<h1>pdfbeads v. 1.1 User's Manual</h1>
|
61
|
+
|
62
|
+
<p>(c) Alexey Kryukov, 2013</p>
|
63
|
+
|
64
|
+
<p>pdfbeads is a small utility intended for creating e-books in PDF format
|
65
|
+
from specially prepared scanned pages. Unlike other similar utilities,
|
66
|
+
pdfbeads attempts to implement the approach more commonly used for DjVu files.
|
67
|
+
Its key feature is separating the scanned page into distinct layers, each
|
68
|
+
layer having its own compression format and resolution.</p>
|
69
|
+
|
70
|
+
<p>Here are some features of pdfbeads:</p>
|
71
|
+
|
72
|
+
<ul>
|
73
|
+
|
74
|
+
<li><p>JBIG2 and JPEG2000 graphical data compression;</p></li>
|
75
|
+
|
76
|
+
<li><p>separating out text and image layers from “mixed” files produced with
|
77
|
+
<a href="http://scantailor.sourceforge.net/">ScanTailor</a>;</p></li>
|
78
|
+
|
79
|
+
<li><p>adding halftone background images to text pages previously
|
80
|
+
converted to B&W;</p></li>
|
81
|
+
|
82
|
+
<li><p>processing indexed images with a limited (small) number of colors
|
83
|
+
so that the colors are preserved and the content is placed into the foreground
|
84
|
+
layer;</p></li>
|
85
|
+
|
86
|
+
<li><p>separating out background and foreground data by masking a source
|
87
|
+
color image;</p></li>
|
88
|
+
|
89
|
+
<li><p>producing PDF files with a table of contents and metadata;</p></li>
|
90
|
+
|
91
|
+
<li><p>adding a hidden text layer produced from previously generated hOCR
|
92
|
+
files or transferring the text data from another PDF file.</p></li>
|
93
|
+
|
94
|
+
</ul>
|
95
|
+
|
96
|
+
<p>The program has been called pdfbeads because building an e-book from
|
97
|
+
separate graphical data seems to me a bit similar to threading beads on a
|
98
|
+
string. Moreover, this name seems appropriate for a Ruby script, since
|
99
|
+
it has something to do with jewelry where rubies (like other gems) are
|
100
|
+
used.</p>
|
101
|
+
|
102
|
+
<h2>Requirements</h2>
|
103
|
+
|
104
|
+
<p>In order to run the program you’ll need first the Ruby runtime
|
105
|
+
environment v. 1.8 or above, which is available by default on most Unix-like
|
106
|
+
systems. A Windows installable package is available at the
|
107
|
+
<a href="http://www.rubyinstaller.org/">RubyInstaller</a> site. You will
|
108
|
+
also need RubyGems (the standard Ruby package manager) and some extensions
|
109
|
+
available (like pdfbeads itself) via the RubyGems framework, namely RMagick,
|
110
|
+
Nokogiri and PDF::Reader. Note that pdfbeads will run even if the last two
|
111
|
+
packages are not available. However, without Nokogiri it will not be possible
|
112
|
+
to read optically recognized text from hOCR files, while PDF::Reader is needed
|
113
|
+
in order to be able to import the text layer from another PDF file.</p>
|
114
|
+
|
115
|
+
<p>If you are interested in creating PDF files using the JBIG2 data compression
|
116
|
+
format, then your system should have also the jbig2 utility from the
|
117
|
+
<a href="http://github.com/agl/jbig2enc">jbig2enc</a> package installed.</p>
|
118
|
+
|
119
|
+
<h2>Installation</h2>
|
120
|
+
|
121
|
+
<p>Downloading and installing the most recent pdfbeads version with the
|
122
|
+
RubyGems package manager is quite simple. Just type in your command line:</p>
|
123
|
+
|
124
|
+
<pre>
|
125
|
+
gem install pdfbeads
|
126
|
+
</pre>
|
127
|
+
|
128
|
+
<p>Before running the script you should ensure the RMagick extension
|
129
|
+
is installed and accessible to the Ruby runtime. <strong>Unfortunately,
|
130
|
+
it is not currently possible to automatically resolve this dependency</strong>,
|
131
|
+
since in some Linux distributions (Ubuntu in particular) the standard
|
132
|
+
installation of the RMagick package circumvents the RubyGems engine, so that
|
133
|
+
the <tt>gem</tt> utility knows nothing about it.</p>
|
134
|
+
|
135
|
+
<p>Ubuntu users should also take into account that in this distribution
|
136
|
+
executable files from gem packages are unpacked into the
|
137
|
+
<tt>/var/lib/gems/<RUBY_VERSION>/bin</tt> directory, which is not
|
138
|
+
included into the PATH environment variable by default. So in order to be
|
139
|
+
able to run pdfbeads without specifying the full path to the executable file
|
140
|
+
you should either modify the PATH variable as necessary, or move the
|
141
|
+
<tt>pdfbeads</tt> script into some directory normally used for executables
|
142
|
+
(<tt>/usr/local/bin</tt> for example).</p>
|
143
|
+
|
144
|
+
<h2>Basic principles</h2>
|
145
|
+
|
146
|
+
<p>pdfbeads workflow is based upon making a difference between a “main” image
|
147
|
+
representing a core of the PDF page and various auxiliary files related
|
148
|
+
to the current page.</p>
|
149
|
+
|
150
|
+
<p>Those files which contain scanned text supposed to be placed into the
|
151
|
+
foreground layer are considered “main”. Images used for this purpose
|
152
|
+
should normally be previously converted to bitonal. pdfbeads is also
|
153
|
+
able to process indexed images with a limited number of colors and a white
|
154
|
+
or transparent background, as well as “mixed” images where bitonal text is
|
155
|
+
combined with halftone pictures. The latter feature is most useful for
|
156
|
+
postprocessing files produced with
|
157
|
+
<a href="http://scantailor.sourceforge.net/">ScanTailor</a>.</p>
|
158
|
+
|
159
|
+
<p>A special treatment is applied to files with double extension, where
|
160
|
+
one of the following suffixes precedes an extension typical for a common
|
161
|
+
graphical format (TIF(F), PNG, JP(E)G, JP2 or JPX):</p>
|
162
|
+
|
163
|
+
<dl>
|
164
|
+
|
165
|
+
<dt>bg or sep</dt>
|
166
|
+
<dd><p>A background image (halftone or indexed);</p></dd>
|
167
|
+
|
168
|
+
<dt>fg</dt>
|
169
|
+
<dd><p>An image supposed to be used to color the foreground layer (like
|
170
|
+
a FG44 chunk in a DJVU file);</p></dd>
|
171
|
+
|
172
|
+
<dt>color</dt>
|
173
|
+
<dd><p>A color image supposed to be used as a source for producing
|
174
|
+
images with the <tt>*.bg.*</tt> and <tt>*.fg.*</tt> suffixes;</p></dd>
|
175
|
+
|
176
|
+
<dt>An RGB color specification (e. g. <tt>black</tt> or <tt>#ff00ff</tt>)</dt>
|
177
|
+
<dd><p>A bitonal image supposed to be displayed with the given color in the
|
178
|
+
target PDF file.</p></dd>
|
179
|
+
|
180
|
+
</dl>
|
181
|
+
|
182
|
+
<p>Furthermore, if the current directory contains any hOCR files with
|
183
|
+
recognized text (their extensions should be either HTM(L) or HOCR),
|
184
|
+
pdfbeads will attempt to use them for building hidden text layer
|
185
|
+
in its PDF output.</p>
|
186
|
+
|
187
|
+
<p>Some of the auxiliary files which have been mentioned above may be
|
188
|
+
produced by pdfbeads during an intermediate stage of its work. Since
|
189
|
+
processing images with the ImageMagick library (which pdfbeads is based on)
|
190
|
+
can take quite a long time, those files are not removed afterwards from
|
191
|
+
the hard drive and may be reused on subsequent runs for time saving.
|
192
|
+
In order to force pdfbeads to recreate those files you may run it with the
|
193
|
+
<tt>-f</tt> (or <tt>--force-update</tt>) option.</p>
|
194
|
+
|
195
|
+
<p>pdfbeads is supposed to be used for building PDF files from previously
|
196
|
+
processed scanned images, and this is the reason which explains some
|
197
|
+
of its features and limitations:</p>
|
198
|
+
|
199
|
+
<ul>
|
200
|
+
|
201
|
+
<li><p>it is not possible to somehow modify scanned images of text pages
|
202
|
+
(except forcing them to a specific DPI value), for they are supposed to
|
203
|
+
be created with the settings the user would like to get, so that there is
|
204
|
+
no need to additionally process them;</p></li>
|
205
|
+
|
206
|
+
<li><p>it is not possible as well to convert color or grayscale scanned images
|
207
|
+
to bitonal. The only exception is those situations (like splitting “mixed”
|
208
|
+
pages where bitonal text areas are combined with halftone pictures or separating out
|
209
|
+
background and foreground data from a source color image by applying a mask)
|
210
|
+
where pdfbeads just finishes the job started by some other applications;</p></li>
|
211
|
+
|
212
|
+
<li><p>any background images taken directly from user’s hard drive are
|
213
|
+
encoded “as is” without any additional processing.</p></li>
|
214
|
+
|
215
|
+
</ul>
|
216
|
+
|
217
|
+
<h2>Getting started</h2>
|
218
|
+
|
219
|
+
<p>The generic command line syntax is as follows:</p>
|
220
|
+
|
221
|
+
<pre>
|
222
|
+
pdfbeads [options] [files to process] [> output_file.pdf]
|
223
|
+
</pre>
|
224
|
+
|
225
|
+
<p>The list of files to be processed may be either obtained from the
|
226
|
+
current directory listing or directly specified in the command line.
|
227
|
+
However in both cases <strong>pdfbeads accepts for processing only those
|
228
|
+
files whose names match a specific pattern:</strong> the extension should
|
229
|
+
be either TIF(F) or PNG (the case doesn’t matter) and there should be
|
230
|
+
no dots inside the base name (i. e. double extensions are not
|
231
|
+
allowed). The reason for this limitation is that the program, as explained
|
232
|
+
above, uses dot-separated file name suffixes to denote some types
|
233
|
+
of auxiliary files, accompanying the scanned text page itself.</p>
|
234
|
+
|
235
|
+
<p>Instead of writing the resulting PDF file to the standard output
|
236
|
+
stream one can use the <tt>-o</tt> (or <tt>--output</tt>) option
|
237
|
+
followed by the name of the file to be created.</p>
|
238
|
+
|
239
|
+
<h2>Processing bitonal images</h2>
|
240
|
+
|
241
|
+
<p>The foreground layer of a PDF page, or its “mask”, is created from the
|
242
|
+
“main” scanned page file passed to pdfbeads. The following rules are applied
|
243
|
+
here:</p>
|
244
|
+
|
245
|
+
<ul>
|
246
|
+
|
247
|
+
<li><p>TIFF or PNG images already converted to bitonal are used “as
|
248
|
+
is”;</p></li>
|
249
|
+
|
250
|
+
<li><p>pages with mixed content are cleared from any halftone pictures
|
251
|
+
(their processing is described in the next section), while the remaining
|
252
|
+
bitonal image is saved into a file with the same base name and the
|
253
|
+
<tt>black.tiff</tt> extension. That’s the <tt>black.tiff</tt> image file
|
254
|
+
which is further used to produce the foreground layer for such a page;</p></li>
|
255
|
+
|
256
|
+
<li><p>indexed images with a white or transparent background which contain
|
257
|
+
a small number of colors (4 by default; this value can be changed via the
|
258
|
+
<tt>-x</tt> (<tt>--max-colors</tt>) option) are split into several
|
259
|
+
bitonal image files according to the number of colors. Each of those
|
260
|
+
files is further encoded separately, so that the resulting PDF page
|
261
|
+
will have several foreground layers, each with its own color. NB: use an
|
262
|
+
indexed PNG image with a transparent background if you want to produce
|
263
|
+
a PDF page with a white-colored text.</p></li>
|
264
|
+
|
265
|
+
</ul>
|
266
|
+
|
267
|
+
<p>It is recommended to encode bitonal text pages as CCITT Group 4 fax
|
268
|
+
compressed TIFF files, since pdfbeads is usually able to read the image
|
269
|
+
data from such files without using the ImageMagick library, thus making
|
270
|
+
the processing speed significantly faster.</p>
|
271
|
+
|
272
|
+
<p>By default pdfbeads attempts to apply JBIG2 compression to the foreground
|
273
|
+
layer, using Adam Langley’s <a href="http://github.com/agl/jbig2enc">jbig2enc</a>
|
274
|
+
utility. You can run pdfbeads with the <tt>-p</tt> (<tt>--pages-per-dict</tt>)
|
275
|
+
option in order to directly specify the desired number of PDF document
|
276
|
+
pages using the common dictionary of shared symbols (15 by default).</p>
|
277
|
+
|
278
|
+
<p>If jbig2enc is not accessible to pdfbeads, then the CCITT Group 4 fax
|
279
|
+
compression method will be used instead. It is also possible to explicitly
|
280
|
+
request this compression type by specifying the <tt>-m</tt>
|
281
|
+
(<tt>--mask-compression</tt>) option with the `G4' parameter (or its synonyms:
|
282
|
+
`Group4', `CCITTFax').</p>
|
283
|
+
|
284
|
+
<h2>Processing halftone images</h2>
|
285
|
+
|
286
|
+
<p>Halftone images are placed into the background layer of a PDF page.
|
287
|
+
This layer is normally supposed to have a lower resolution than its mask.
|
288
|
+
pdfbeads can either take a background image directly from the hard drive
|
289
|
+
(i. e. from a file with a <strong>bg</strong> or <strong>sep</strong>
|
290
|
+
extension suffix), or produce it by splitting a mixed image file.</p>
|
291
|
+
|
292
|
+
<p>When processing mixed image files pdfbeads first separates pictures
|
293
|
+
from text areas. This is achieved by filling any black pixels with white.
|
294
|
+
The resulting image is saved to the hard drive in such a way that the
|
295
|
+
following command line options are taken into account:</p>
|
296
|
+
|
297
|
+
<dl>
|
298
|
+
<dt>-b, --bg-compression</dt>
|
299
|
+
<dd><p>The data compression format. The following values are allowed:
|
300
|
+
`JPEG2000' (also `JP2' or `JPX'), `JPEG' (or `JPG') and `LOSSLESS'
|
301
|
+
(synonyms are `DEFLATE' and `PNG'). pdfbeads will attempt to use the JPEG2000
|
302
|
+
compression by default. However it falls back to JPEG if JPEG2000 format
|
303
|
+
is not supported by the currently used ImageMagick build (which is often
|
304
|
+
the case). If the option has been set to LOSSLESS, then pdfbeads will
|
305
|
+
compress background images with the deflate method. Of course this choice
|
306
|
+
would normally result into producing a much larger output file than with
|
307
|
+
JPEG2000 or JPEG.</p></dd>
|
308
|
+
|
309
|
+
<dt>-B, --bg-resolution DPI</dt>
|
310
|
+
<dd><p>The resolution for the background. Reasonable values usually lie
|
311
|
+
between 150 and 300 dpi (300 by default).</p></dd>
|
312
|
+
|
313
|
+
<dt>-g, --grayscale</dt>
|
314
|
+
<dd><p>Forces pdfbeads to convert color images into grayscale. This option
|
315
|
+
would be useful for processing images which have been produced by scanning
|
316
|
+
pages with gray pictures in color mode and haven’t been previously converted
|
317
|
+
to grayscale. Such a situation may often occur, for example, when processing
|
318
|
+
digital photos with ScanTailor.</p></dd>
|
319
|
+
|
320
|
+
<p>When pdfbeads loads previously produced background image from the hard
|
321
|
+
drive, it doesn’t perform any additional processing. JPEG and JPEG2000
|
322
|
+
images are inserted into the resulting PDF file “as is”, while images
|
323
|
+
taken from TIFF and PNG files are compressed with the deflate method.
|
324
|
+
However if there are several <tt>*.bg.*</tt> or <tt>*.sep.*</tt> files
|
325
|
+
which have the same base name, but different extensions, then the graphical
|
326
|
+
format specified with the <tt>--bg-compression</tt> option will take
|
327
|
+
precedence.</p>
|
328
|
+
|
329
|
+
</dl>
|
330
|
+
|
331
|
+
<h2>Separating color images using a mask image</h2>
|
332
|
+
|
333
|
+
<p>Separating a scanned image into distinct layers is especially difficult
|
334
|
+
in case the text has been printed above a picture or texture. In order to
|
335
|
+
effectively package such a page into a pdf file one should prepare two
|
336
|
+
graphical files:</p>
|
337
|
+
|
338
|
+
<ul>
|
339
|
+
|
340
|
+
<li><p>a bitonal or indexed image containing just the scanned text or any
|
341
|
+
other elements supposed to be placed into the foreground layer;</p></li>
|
342
|
+
|
343
|
+
<li><p>a color scan of the same page (pdfbeads recognizes such images
|
344
|
+
by the <tt>*.color.*</tt> filename suffix).</p></li>
|
345
|
+
|
346
|
+
</ul>
|
347
|
+
|
348
|
+
<p>The first file will serve as a stencil: based on its shapes pdfbeads will
|
349
|
+
attempt to produce from the color scan two new images, so that the first
|
350
|
+
one (with the <tt>*.bg.*</tt> suffix) will contain just the color
|
351
|
+
background cleaned up from any text data, while on the second one (with the
|
352
|
+
<tt>*.fg.*</tt> suffix) just the mask elements with the corresponding
|
353
|
+
texture will remain. This procedure is very similar to one performed by
|
354
|
+
the <tt>djvumake</tt> when we run it with the <tt>PPM</tt> option.
|
355
|
+
In either case the purpose is to produce a 3-layered page where the first
|
356
|
+
color layer is responsible for the image background while the second one
|
357
|
+
is used to specify colors and textures for the mask which is placed
|
358
|
+
above.</p>
|
359
|
+
|
360
|
+
<p>In order to achieve the desired result it is necessary that the mask
|
361
|
+
can be placed above the color images without any shifts or distortions.
|
362
|
+
On the other hand, it is OK if the two images have different resolutions
|
363
|
+
(and thus different pixel sizes): in such a case pdfbeads will first resize
|
364
|
+
the stencil so that it matches the size of the color image. Note that, if
|
365
|
+
all the text at the page is black (or at least darker than the background),
|
366
|
+
it would be convenient to use ScanTailor for producing both the source
|
367
|
+
graphical files. In order to do that one should output the same page first
|
368
|
+
as “Black and White” and then as “Color/Grayscale”.</p>
|
369
|
+
|
370
|
+
<p>Also note that, if the stencil image is represented with an indexed
|
371
|
+
(but not bitonal) image with the number of colors equal to or less than
|
372
|
+
the current value of the <tt>--max-colors</tt> option, then pdfbeads
|
373
|
+
will not create a <tt>*.fg.*</tt> file: instead it will just place
|
374
|
+
the stencil with the previously specified colors above the background
|
375
|
+
layer cleaned up from the text data.</p>
|
376
|
+
|
377
|
+
<p>To conclude this section I’d like to mention that the segmentation
|
378
|
+
algorithm used by pdfbeads has been inspired by
|
379
|
+
<a href="http://www.imagemagick.org/discourse-server/viewtopic.php?p=41498#p41498">a
|
380
|
+
thread at the ImageMagick forum</a>, where possible methods to remove text
|
381
|
+
from an image and then fill the resulting “gaps” basing on the values of the
|
382
|
+
neighboring pixels have been discussed.</p>
|
383
|
+
|
384
|
+
<h2>Additional features</h2>
|
385
|
+
|
386
|
+
<h3>Adding metadata</h3>
|
387
|
+
|
388
|
+
<p>In order to include some information about author, book title etc.
|
389
|
+
into the PDF file going to be produced by pdfbeads, one should first put
|
390
|
+
those data into a special ASCII or UTF-8 encoded text file. Each line
|
391
|
+
of the file should be formatted as follows:</p>
|
392
|
+
|
393
|
+
<pre>/<KEYWORD>: "Some text"
|
394
|
+
</pre>
|
395
|
+
|
396
|
+
<p>The following keyword strings are currently recognized by pdfbeads:
|
397
|
+
<tt>Title</tt>, <tt>Author</tt>, <tt>Subject</tt> and <tt>Keywords</tt>.
|
398
|
+
Any lines starting with the `#' character are considered comments and
|
399
|
+
ignored.</p>
|
400
|
+
|
401
|
+
<p>A reference to the metadata file can be passed to pdfbeads via the
|
402
|
+
<tt>-M</tt> (or <tt>--meta</tt>) option.</p>
|
403
|
+
|
404
|
+
<h3>Page labels</h3>
|
405
|
+
|
406
|
+
<p>pdfbeads allows to generate page labels which may be then displayed by
|
407
|
+
a PDF viewer instead of physical page numbers. Thus it is possible
|
408
|
+
to bring page numbering of the electronic document into accordance with
|
409
|
+
the pagination of the paper book. Page labels may be specified with the
|
410
|
+
<tt>-L</tt> (or <tt>--labels</tt>) command line key. This option takes
|
411
|
+
an argument which should be enclosed into quotation marks and may contain
|
412
|
+
one or more numbering range specifications, separated with semicolons.</p>
|
413
|
+
|
414
|
+
<p>A numbering range is constructed from the following components (each of
|
415
|
+
them being optional):</p>
|
416
|
+
|
417
|
+
<ul>
|
418
|
+
<li><p>The physical number of the first page of the given range in the
|
419
|
+
PDF document, separated with a colon from the rest of the specification.
|
420
|
+
Note that pages in PDF documents are numbered starting from zero, so for
|
421
|
+
the first range this value should always be zero.</p></li>
|
422
|
+
|
423
|
+
<li><p>An arbitrary numbering prefix (any characters, except a double
|
424
|
+
quotation mark, a colon, a semicolon, and a percent sign are allowed
|
425
|
+
here).</p></li>
|
426
|
+
|
427
|
+
<li><p>A numbering format description, which starts from a percent sign
|
428
|
+
followed by a single Latin letter corresponding to a particular numbering
|
429
|
+
style:</p></li>
|
430
|
+
|
431
|
+
<dl>
|
432
|
+
|
433
|
+
<dt>D</dt>
|
434
|
+
<dd><p>arabic digits;</p></dd>
|
435
|
+
|
436
|
+
<dt>R</dt>
|
437
|
+
<dd><p>uppercase Roman numerals;</p></dd>
|
438
|
+
|
439
|
+
<dt>r</dt>
|
440
|
+
<dd><p>lowercase Roman numerals;</p></dd>
|
441
|
+
|
442
|
+
<dt>A</dt>
|
443
|
+
<dd><p>uppercase Latin letters;</p></dd>
|
444
|
+
|
445
|
+
<dt>a</dt>
|
446
|
+
<dd><p>lowercase Latin letters.</p></dd>
|
447
|
+
|
448
|
+
</dl>
|
449
|
+
|
450
|
+
<p>Between the percent sign and the numbering format identifier it is
|
451
|
+
possible to put an arbitrary number, thus setting the number to be
|
452
|
+
displayed for the first page of the given range (1 by default).</p>
|
453
|
+
|
454
|
+
</ul>
|
455
|
+
|
456
|
+
<p>Suppose for example that a book starts from two unnumbered title pages
|
457
|
+
followed by 32 pages numbered with roman digits. Then goes an arabic
|
458
|
+
pagination, which, however, starts straight from 33. So the following argument
|
459
|
+
for the <tt>--labels</tt> option would be appropriate:</p>
|
460
|
+
|
461
|
+
<pre>
|
462
|
+
"0:Title %D;2:%R;34:%33D"
|
463
|
+
</pre>
|
464
|
+
|
465
|
+
<h3>Building table of contents</h3>
|
466
|
+
|
467
|
+
<p>pdfbeads allows to add table of contents (PDF bookmarks) to the PDF file
|
468
|
+
to be generated. This is done with the <tt>-C</tt> (or <tt>--toc</tt>) option,
|
469
|
+
which accepts as an argument a path to a text file.</p>
|
470
|
+
|
471
|
+
<p>The TOC file should be UTF-8 encoded and consist of lines formatted
|
472
|
+
as follows (lines beginning from the `#' character are considered comments
|
473
|
+
and ignored):</p>
|
474
|
+
|
475
|
+
<pre>
|
476
|
+
<indent>"Heading" "Page number" [0|-|1|+]
|
477
|
+
</pre>
|
478
|
+
|
479
|
+
<p>The heading level is determined by its indent (which may be formed either
|
480
|
+
from spaces or from tabs, but mixing both styles inside the same file is not
|
481
|
+
allowed). The indent is followed by the fields of heading and page number,
|
482
|
+
which are separated with any number of space characters and may be enclosed
|
483
|
+
into double quotation marks if necessary. The last optional parameter
|
484
|
+
specifies, if this TOC entry should be displayed unfolded by default (the
|
485
|
+
characters `+' and `1' mean “yes”).</p>
|
486
|
+
|
487
|
+
<p>It is a good idea to use the <tt>--toc</tt> option together with
|
488
|
+
<tt>--labels</tt>. Thus it is possible to use in the TOC file the same
|
489
|
+
page numbers as in the paper book without taking care about any shifts
|
490
|
+
of the numbering.</p>
|
491
|
+
|
492
|
+
<h3>Adding text layer</h3>
|
493
|
+
|
494
|
+
<p>It is possible with pdfbeads to create PDF files with a hidden text
|
495
|
+
layer. The text for a hidden layer may be either obtained from
|
496
|
+
<a href="http://docs.google.com/View?docid=dfxcv4vc_67g844kf">hOCR</a>
|
497
|
+
files (hOCR is a HTML language extension, allowing to store information
|
498
|
+
about exact positioning of characters and markup elements on the page) or
|
499
|
+
imported from another PDF file.</p>
|
500
|
+
|
501
|
+
<p>In order to create hOCR files you need an optical character recognition
|
502
|
+
program which supports this format, for example
|
503
|
+
<a href="https://launchpad.net/cuneiform-linux/">Cuneiform</a> or
|
504
|
+
<a href="http://code.google.com/p/tesseract-ocr/">Tesseract</a>.
|
505
|
+
The recognized text should be saved into the same directory as the other
|
506
|
+
files related to the project. Each recognized page should be
|
507
|
+
represented by a separate file with the same base name as the source
|
508
|
+
image and the HTM(L) or HOCR extension. hOCR files are
|
509
|
+
processed automatically, provided that the Nokogiri extension is available
|
510
|
+
to the Ruby interpreter.</p>
|
511
|
+
|
512
|
+
<p>Another possible solution is to import the text
|
513
|
+
layer from another PDF file (naturally, that file should be produced by
|
514
|
+
recognizing the very same images which are then going to be processed
|
515
|
+
with pdfbeads). The name of the resulting file should be passed to pdfbeads via the
|
516
|
+
<tt>-T</tt> (or <tt>--text-pdf</tt>) option. This
|
517
|
+
feature is especially important when you have to use
|
518
|
+
a commercial application for text recognition (for example,
|
519
|
+
<a href="http://www.abbyy.ru/finereader/">ABBYY Finereader</a>) which
|
520
|
+
doesn’t support the hOCR format. <strong>Warning:</strong> when
|
521
|
+
creating the intermediate PDF file in ABBYY Finereader you should use
|
522
|
+
the “text under the page image” or “text over the page
|
523
|
+
image” settings, since with other settings the character placement on
|
524
|
+
the page may not exactly match the source graphical
|
525
|
+
file.</p>
|
526
|
+
|
527
|
+
<h3>Processing files with the right-to-left text direction</h3>
|
528
|
+
|
529
|
+
<p>The <tt>-R</tt> (or <tt>--right-to-left</tt>) option allows you to mark the
|
530
|
+
PDF file produced by pdfbeads with a special flag indicating that the main
|
531
|
+
text direction for the given document is right-to-left. This flag will allow
|
532
|
+
Adobe Reader™ to correctly order pages when displaying them in the
|
533
|
+
side-by-side mode.</p>
|
534
|
+
|
535
|
+
<h2>License</h2>
|
536
|
+
|
537
|
+
<p>This program is free software; you can redistribute it and/or modify
|
538
|
+
it under the terms of the GNU General Public License as published by
|
539
|
+
the Free Software Foundation; either version 2 of the License, or
|
540
|
+
(at your option) any later version.</p>
|
541
|
+
|
542
|
+
<p>This program is distributed in the hope that it will be useful,
|
543
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
544
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
545
|
+
GNU General Public License for more details.</p>
|
546
|
+
|
547
|
+
<p>You should have received a copy of the GNU General Public License
|
548
|
+
along with this program; if not, write to the Free Software
|
549
|
+
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.</p>
|
550
|
+
|
551
|
+
</body>
|
552
|
+
</html>
|