pdfbeads 1.0.9 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/ChangeLog +23 -0
- data/bin/pdfbeads +28 -3
- data/doc/pdfbeads.en.html +552 -0
- data/doc/pdfbeads.ru.html +74 -34
- data/lib/pdfbeads.rb +17 -6
- data/lib/pdfbeads/pdfbuilder.rb +254 -74
- data/lib/pdfbeads/pdfpage.rb +8 -8
- data/lib/pdfbeads/pdftoc.rb +7 -3
- metadata +80 -48
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 577a47277a3e474a47b740ce93ca89041402c4fb
|
4
|
+
data.tar.gz: 88a1f950e31e41e47f79ef978d9e589e1b9724fb
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 16ab45bd0c4d63d7c6f567f3df5e2dea2ecc1e9237ac9b5acfc9cd92e31a72a8df7ad184e5c7fd779cb990f58ff64dbb3027464c7579fd79d58fc270ac01c841
|
7
|
+
data.tar.gz: b946c901407f06aeaf509988341759ba4f4cfc217067b00fb72ad5bc9ae60e4d1dd0729bb7a3761a6c06f71816dcb9398e2cde4335ee8a5620c1686e208fe901
|
data/ChangeLog
CHANGED
@@ -55,3 +55,26 @@
|
|
55
55
|
* Don't attempt to use 'ocrx_word' elements which contain no bounding box
|
56
56
|
data (this should fix the problem with the hOCR output produced by some
|
57
57
|
tesseract versions).
|
58
|
+
|
59
|
+
2013 Mar 20 (Alexey Kryukov) Version 1.1.0
|
60
|
+
|
61
|
+
+ It is now possible to take the text layer from another PDF document (normally
|
62
|
+
this would be a file produced by passing the same set of images to an
|
63
|
+
OCR application) and embed it into the pdfbeads output. Warning: this feature
|
64
|
+
has been tested so far only with files produced with ABBYY FineReader. It may or
|
65
|
+
may not work with PDF files generated by other OCR programs.
|
66
|
+
|
67
|
+
* The default PDF page layout is now "OneColumn".
|
68
|
+
|
69
|
+
+ Make it possible to specify that the preferred reading direction for the
|
70
|
+
PDF document is right-to-left.
|
71
|
+
|
72
|
+
+ In order to simplify debugging of resulting files I have added a special
|
73
|
+
flag allowing to make the hidden text layer visible and to disable
|
74
|
+
compression in page streams.
|
75
|
+
|
76
|
+
2014 Jan 26 (Alexey Kryukov) Version 1.1.1
|
77
|
+
|
78
|
+
* hpricot is no longer developed, so switch to Nokogiri for hOCR processing.
|
79
|
+
|
80
|
+
+ English HTML documentation added.
|
data/bin/pdfbeads
CHANGED
@@ -41,9 +41,12 @@ include PDFBeads
|
|
41
41
|
pdfargs = Hash[
|
42
42
|
:labels => nil,
|
43
43
|
:toc => nil,
|
44
|
-
:pagelayout => '
|
44
|
+
:pagelayout => 'OneColumn',
|
45
45
|
:meta => nil,
|
46
|
-
:
|
46
|
+
:textpdf => nil,
|
47
|
+
:delfiles => false,
|
48
|
+
:debug => false,
|
49
|
+
:rtl => false
|
47
50
|
]
|
48
51
|
pageargs = Hash[
|
49
52
|
:threshold => 1,
|
@@ -87,6 +90,23 @@ OptionParser.new() do |opts|
|
|
87
90
|
|
88
91
|
pdfargs[:pagelayout] = pagelayout
|
89
92
|
end
|
93
|
+
opts.on("-R", "--right-to-left",
|
94
|
+
"Set the flag indicating that the preferred reading",
|
95
|
+
"direction for the resulting PDF file is right to left") do |rtl|
|
96
|
+
pdfargs[:rtl] = rtl
|
97
|
+
end
|
98
|
+
opts.on("-T", "--text-pdf PDFFILE",
|
99
|
+
"Specify a PDF file produced by passing the same set",
|
100
|
+
"of files to an OCR program. Pdfbeads will use that file",
|
101
|
+
"to generate the hidden text layer for its PDF output.") do |pdffile|
|
102
|
+
|
103
|
+
if $has_pdfreader
|
104
|
+
pdfargs[:textpdf] = pdffile
|
105
|
+
else
|
106
|
+
$stderr.puts( "Warning: the pdf/reader extension is not available." )
|
107
|
+
$stderr.puts( "\tthe -T/--text-pdf option is ignored." )
|
108
|
+
end
|
109
|
+
end
|
90
110
|
|
91
111
|
opts.separator "\n"
|
92
112
|
opts.separator "Image encoding and compression options:\n"
|
@@ -147,7 +167,7 @@ OptionParser.new() do |opts|
|
|
147
167
|
"Compression method for background images. Acceptable",
|
148
168
|
"values are JP2|JPX|JPEG2000, JPG|JPEG or PNG|LOSSLESS.",
|
149
169
|
"JP2 is used by default, unless this format is not",
|
150
|
-
"supported by the available version
|
170
|
+
"supported by the available ImageMagick version" ) do |format|
|
151
171
|
case format.upcase
|
152
172
|
when 'JP2', 'JPX', 'J2K', 'JPEG2000'
|
153
173
|
pageargs[:bg_format] = 'JP2'
|
@@ -178,6 +198,11 @@ OptionParser.new() do |opts|
|
|
178
198
|
"Print output to a file instead of STDERR") do |f|
|
179
199
|
outpath = f
|
180
200
|
end
|
201
|
+
opts.on("-D", "--debug",
|
202
|
+
"Simplify debugging the PDF output by making the hidden",
|
203
|
+
"text layer visible and using uncompressed page streams") do |dbg|
|
204
|
+
pdfargs[:debug] = dbg
|
205
|
+
end
|
181
206
|
opts.on_tail("-h", "--help", "Show this message") do
|
182
207
|
puts opts
|
183
208
|
exit
|
@@ -0,0 +1,552 @@
|
|
1
|
+
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
|
2
|
+
<html>
|
3
|
+
<head>
|
4
|
+
|
5
|
+
<title>PDFBeads -- convert scanned images to a single PDF file</title>
|
6
|
+
|
7
|
+
<meta content="text/html; charset=UTF-8" http-equiv="Content-Type">
|
8
|
+
|
9
|
+
<meta name="Generator" content="Written directly in html">
|
10
|
+
|
11
|
+
<meta name="Description" content="pdfbeads v. 1.1 User's Manual">
|
12
|
+
|
13
|
+
<style type="text/css">
|
14
|
+
body {
|
15
|
+
font-family: Times New Roman, Times, serif;
|
16
|
+
text-align: justify;
|
17
|
+
}
|
18
|
+
a:link {
|
19
|
+
color: blue; text-decoration: underline
|
20
|
+
}
|
21
|
+
a:hover {
|
22
|
+
color: fuchsia
|
23
|
+
}
|
24
|
+
a:active {
|
25
|
+
color: fuchsia
|
26
|
+
}
|
27
|
+
a:visited {
|
28
|
+
color: purple
|
29
|
+
}
|
30
|
+
h1 {
|
31
|
+
font-size: 36px;
|
32
|
+
font-family: Times New Roman, Times, serif;
|
33
|
+
text-align: center;
|
34
|
+
font-style: normal;
|
35
|
+
font-weight: bold
|
36
|
+
}
|
37
|
+
h2 {
|
38
|
+
font-size: 20px;
|
39
|
+
font-family: Arial, Helvetica, sans-serif;
|
40
|
+
text-align: center;
|
41
|
+
font-style: normal;
|
42
|
+
font-weight: bold;
|
43
|
+
}
|
44
|
+
h3 {
|
45
|
+
font-size: 16px;
|
46
|
+
font-family: Arial, Helvetica, sans-serif;
|
47
|
+
text-align: left;
|
48
|
+
font-style: italic;
|
49
|
+
font-weight: bold;
|
50
|
+
}
|
51
|
+
dt {
|
52
|
+
font-weight: bold;
|
53
|
+
}
|
54
|
+
</style>
|
55
|
+
|
56
|
+
</head>
|
57
|
+
|
58
|
+
<body>
|
59
|
+
|
60
|
+
<h1>pdfbeads v. 1.1 User's Manual</h1>
|
61
|
+
|
62
|
+
<p>(c) Alexey Kryukov, 2013</p>
|
63
|
+
|
64
|
+
<p>pdfbeads is a small utility intended for creating e-books in PDF format
|
65
|
+
from specially prepared scanned pages. Unlike other similar utilities,
|
66
|
+
pdfbeads attempts to implement the approach more commonly used for DjVu files.
|
67
|
+
Its key feature is separating the scanned page into distinct layers, each
|
68
|
+
layer having its own compression format and resolution.</p>
|
69
|
+
|
70
|
+
<p>Here are some features of pdfbeads:</p>
|
71
|
+
|
72
|
+
<ul>
|
73
|
+
|
74
|
+
<li><p>JBIG2 and JPEG2000 graphical data compression;</p></li>
|
75
|
+
|
76
|
+
<li><p>separating out text and image layers from “mixed” files produced with
|
77
|
+
<a href="http://scantailor.sourceforge.net/">ScanTailor</a>;</p></li>
|
78
|
+
|
79
|
+
<li><p>adding halftone background images to text pages previously
|
80
|
+
converted to B&amp;W;</p></li>
|
81
|
+
|
82
|
+
<li><p>processing indexed images with a limited (small) number of colors
|
83
|
+
so that the colors are preserved and the content is placed into the foreground
|
84
|
+
layer;</p></li>
|
85
|
+
|
86
|
+
<li><p>separating out background and foreground data by masking a source
|
87
|
+
color image;</p></li>
|
88
|
+
|
89
|
+
<li><p>producing PDF files with a table of contents and metadata;</p></li>
|
90
|
+
|
91
|
+
<li><p>adding a hidden text layer produced from previously generated hOCR
|
92
|
+
files or transferring the text data from another PDF file.</p></li>
|
93
|
+
|
94
|
+
</ul>
|
95
|
+
|
96
|
+
<p>The program has been called pdfbeads because building an e-book from
|
97
|
+
separate graphical data seems to me a bit similar to threading beads on a
|
98
|
+
string. Moreover, this name seems appropriate for a Ruby script, since
|
99
|
+
it has something to do with jewelry where rubies (like other gems) are
|
100
|
+
used.</p>
|
101
|
+
|
102
|
+
<h2>Requirements</h2>
|
103
|
+
|
104
|
+
<p>In order to run the program you’ll need first the Ruby runtime
|
105
|
+
environment v. 1.8 or above, which is available by default on most Unix-like
|
106
|
+
systems. A Windows installable package is available at the
|
107
|
+
<a href="http://www.rubyinstaller.org/">RubyInstaller</a> site. You will
|
108
|
+
also need RubyGems (the standard Ruby package manager) and some extensions
|
109
|
+
available (like pdfbeads itself) via the RubyGems framework, namely RMagick,
|
110
|
+
Nokogiri and PDF::Reader. Note that pdfbeads will run even if the last two
|
111
|
+
packages are not available. However, without Nokogiri it will not be possible
|
112
|
+
to read optically recognized text from hOCR files, while PDF::Reader is needed
|
113
|
+
in order to be able to import the text layer from another PDF file.</p>
|
114
|
+
|
115
|
+
<p>If you are interested in creating PDF files using the JBIG2 data compression
|
116
|
+
format, then your system should have also the jbig2 utility from the
|
117
|
+
<a href="http://github.com/agl/jbig2enc">jbig2enc</a> package installed.</p>
|
118
|
+
|
119
|
+
<h2>Installation</h2>
|
120
|
+
|
121
|
+
<p>Downloading and installing the most recent pdfbeads version with the
|
122
|
+
RubyGems package manager is quite simple. Just type in your command line:</p>
|
123
|
+
|
124
|
+
<pre>
|
125
|
+
gem install pdfbeads
|
126
|
+
</pre>
|
127
|
+
|
128
|
+
<p>Before running the script you should ensure the RMagick extension
|
129
|
+
is installed and accessible to the Ruby runtime. <strong>Unfortunately,
|
130
|
+
it is not currently possible to automatically resolve this dependency</strong>,
|
131
|
+
since in some Linux distributions (Ubuntu in particular) the standard
|
132
|
+
installation of the RMagick package circumvents the RubyGems engine, so that
|
133
|
+
the <tt>gem</tt> utility knows nothing about it.</p>
|
134
|
+
|
135
|
+
<p>Ubuntu users should also take into account that in this distribution
|
136
|
+
executable files from gem packages are unpacked into the
|
137
|
+
<tt>/var/lib/gems/&lt;RUBY_VERSION&gt;/bin</tt> directory, which is not
|
138
|
+
included into the PATH environment variable by default. So in order to be
|
139
|
+
able to run pdfbeads without specifying the full path to the executable file
|
140
|
+
you should either modify the PATH variable as necessary, or move the
|
141
|
+
<tt>pdfbeads</tt> script into some directory normally used for executables
|
142
|
+
(<tt>/usr/local/bin</tt> for example).</p>
|
143
|
+
|
144
|
+
<h2>Basic principles</h2>
|
145
|
+
|
146
|
+
<p>pdfbeads workflow is based upon making a difference between a “main” image
|
147
|
+
representing a core of the PDF page and various auxiliary files related
|
148
|
+
to the current page.</p>
|
149
|
+
|
150
|
+
<p>Those files which contain scanned text supposed to be placed into the
|
151
|
+
foreground layer are considered “main”. Images used for this purpose
|
152
|
+
should normally be previously converted to bitonal. pdfbeads is also
|
153
|
+
able to process indexed images with a limited number of colors and a white
|
154
|
+
or transparent background, as well as “mixed” images where bitonal text is
|
155
|
+
combined with halftone pictures. The latter feature is most useful for
|
156
|
+
postprocessing files produced with
|
157
|
+
<a href="http://scantailor.sourceforge.net/">ScanTailor</a>.</p>
|
158
|
+
|
159
|
+
<p>A special treatment is applied to files with double extension, where
|
160
|
+
one of the following suffixes precedes an extension typical for a common
|
161
|
+
graphical format (TIF(F), PNG, JP(E)G, JP2 or JPX):</p>
|
162
|
+
|
163
|
+
<dl>
|
164
|
+
|
165
|
+
<dt>bg or sep</dt>
|
166
|
+
<dd><p>A background image (halftone or indexed);</p></dd>
|
167
|
+
|
168
|
+
<dt>fg</dt>
|
169
|
+
<dd><p>An image supposed to be used to color the foreground layer (like
|
170
|
+
a FG44 chunk in a DJVU file);</p></dd>
|
171
|
+
|
172
|
+
<dt>color</dt>
|
173
|
+
<dd><p>A color image supposed to be used as a source for producing
|
174
|
+
images with the <tt>*.bg.*</tt> and <tt>*.fg.*</tt> suffixes;</p></dd>
|
175
|
+
|
176
|
+
<dt>An RGB color specification (e. g. <tt>black</tt> or <tt>#ff00ff</tt>)</dt>
|
177
|
+
<dd><p>A bitonal image supposed to be displayed with the given color in the
|
178
|
+
target PDF file.</p></dd>
|
179
|
+
|
180
|
+
</dl>
|
181
|
+
|
182
|
+
<p>Furthermore, if the current directory contains any hOCR files with
|
183
|
+
recognized text (their extensions should be either HTM(L) or HOCR),
|
184
|
+
pdfbeads will attempt to use them for building hidden text layer
|
185
|
+
in its PDF output.</p>
|
186
|
+
|
187
|
+
<p>Some of the auxiliary files which have been mentioned above may be
|
188
|
+
produced by pdfbeads during an intermediate stage of its work. Since
|
189
|
+
processing images with the ImageMagick library (which pdfbeads is based on)
|
190
|
+
can take quite a long time, those files are not removed afterwards from
|
191
|
+
the hard drive and may be reused on subsequent runs for time saving.
|
192
|
+
In order to force pdfbeads to recreate those files you may run it with the
|
193
|
+
<tt>-f</tt> (or <tt>--force-update</tt>) option.</p>
|
194
|
+
|
195
|
+
<p>pdfbeads is supposed to be used for building PDF files from previously
|
196
|
+
processed scanned images, and this is the reason which explains some
|
197
|
+
of its features and limitations:</p>
|
198
|
+
|
199
|
+
<ul>
|
200
|
+
|
201
|
+
<li><p>it is not possible to somehow modify scanned images of text pages
|
202
|
+
(except forcing them to a specific DPI value), for they are supposed to
|
203
|
+
be created with the settings the user would like to get, so that there is
|
204
|
+
no need to additionally process them;</p></li>
|
205
|
+
|
206
|
+
<li><p>it is not possible as well to convert color or grayscale scanned images
|
207
|
+
to bitonal. The only exception is those situations (like splitting “mixed”
|
208
|
+
pages where bitonal text areas are combined with halftone pictures or separating out
|
209
|
+
background and foreground data from a source color image by applying a mask)
|
210
|
+
where pdfbeads just finishes the job started by some other applications;</p></li>
|
211
|
+
|
212
|
+
<li><p>any background images taken directly from user’s hard drive are
|
213
|
+
encoded “as is” without any additional processing.</p></li>
|
214
|
+
|
215
|
+
</ul>
|
216
|
+
|
217
|
+
<h2>Getting started</h2>
|
218
|
+
|
219
|
+
<p>The generic command line syntax is as follows:</p>
|
220
|
+
|
221
|
+
<pre>
|
222
|
+
pdfbeads [options] [files to process] [> output_file.pdf]
|
223
|
+
</pre>
|
224
|
+
|
225
|
+
<p>The list of files to be processed may be either obtained from the
|
226
|
+
current directory listing or directly specified in the command line.
|
227
|
+
However in both cases <strong>pdfbeads accepts for processing only those
|
228
|
+
files whose names match a specific pattern:</strong> the extension should
|
229
|
+
be either TIF(F) or PNG (the case doesn’t matter) and there should be
|
230
|
+
no dots inside the base name (i. e. double extensions are not
|
231
|
+
allowed). The reason for this limitation is that the program, as explained
|
232
|
+
above, uses dot-separated file name suffixes to denote some types
|
233
|
+
of auxiliary files, accompanying the scanned text page itself.</p>
|
234
|
+
|
235
|
+
<p>Instead of writing the resulting PDF file to the standard output
|
236
|
+
stream one can use the <tt>-o</tt> (or <tt>--output</tt>) option
|
237
|
+
followed by the name of the file to be created.</p>
|
238
|
+
|
239
|
+
<h2>Processing bitonal images</h2>
|
240
|
+
|
241
|
+
<p>The foreground layer of a PDF page, or its “mask”, is created from the
|
242
|
+
“main” scanned page file passed to pdfbeads. The following rules are applied
|
243
|
+
here:</p>
|
244
|
+
|
245
|
+
<ul>
|
246
|
+
|
247
|
+
<li><p>TIFF or PNG images already converted to bitonal are used “as
|
248
|
+
is”;</p></li>
|
249
|
+
|
250
|
+
<li><p>pages with mixed content are cleared from any halftone pictures
|
251
|
+
(their processing is described in the next section), while the remaining
|
252
|
+
bitonal image is saved into a file with the same base name and the
|
253
|
+
<tt>black.tiff</tt> extension. That’s the <tt>black.tiff</tt> image file
|
254
|
+
which is further used to produce the foreground layer for such a page;</p></li>
|
255
|
+
|
256
|
+
<li><p>indexed images with a white or transparent background which contain
|
257
|
+
a small number of colors (4 by default; this value can be changed via the
|
258
|
+
<tt>-x</tt> (<tt>--max-colors</tt>) option) are split into several
|
259
|
+
bitonal image files according to the number of colors. Each of those
|
260
|
+
files is further encoded separately, so that the resulting PDF page
|
261
|
+
will have several foreground layers, each with its own color. NB: use an
|
262
|
+
indexed PNG image with a transparent background if you want to produce
|
263
|
+
a PDF page with a white-colored text.</p></li>
|
264
|
+
|
265
|
+
</ul>
|
266
|
+
|
267
|
+
<p>It is recommended to encode bitonal text pages as CCITT Group 4 fax
|
268
|
+
compressed TIFF files, since pdfbeads is usually able to read the image
|
269
|
+
data from such files without using the ImageMagick library, thus making
|
270
|
+
the processing speed significantly faster.</p>
|
271
|
+
|
272
|
+
<p>By default pdfbeads attempts to apply JBIG2 compression to the foreground
|
273
|
+
layer, using Adam Langley’s <a href="http://github.com/agl/jbig2enc">jbig2enc</a>
|
274
|
+
utility. You can run pdfbeads with the <tt>-p</tt> (<tt>--pages-per-dict</tt>)
|
275
|
+
option in order to directly specify the desired number of PDF document
|
276
|
+
pages using the common dictionary of shared symbols (15 by default).</p>
|
277
|
+
|
278
|
+
<p>If jbig2enc is not accessible to pdfbeads, then the CCITT Group 4 fax
|
279
|
+
compression method will be used instead. It is also possible to explicitly
|
280
|
+
request this compression type by specifying the <tt>-m</tt>
|
281
|
+
(<tt>--mask-compression</tt>) option with the `G4' parameter (or its synonyms:
|
282
|
+
`Group4', `CCITTFax').</p>
|
283
|
+
|
284
|
+
<h2>Processing halftone images</h2>
|
285
|
+
|
286
|
+
<p>Halftone images are placed into the background layer of a PDF page.
|
287
|
+
This layer is normally supposed to have a lower resolution than its mask.
|
288
|
+
pdfbeads can either take a background image directly from the hard drive
|
289
|
+
(i. e. from a file with a <strong>bg</strong> or <strong>sep</strong>
|
290
|
+
extension suffix), or produce it by splitting a mixed image file.</p>
|
291
|
+
|
292
|
+
<p>When processing mixed image files pdfbeads first separates pictures
|
293
|
+
from text areas. This is achieved by filling any black pixels with white.
|
294
|
+
The resulting image is saved to the hard drive in such a way that the
|
295
|
+
following command line options are taken into account:</p>
|
296
|
+
|
297
|
+
<dl>
|
298
|
+
<dt>-b, --bg-compression</dt>
|
299
|
+
<dd><p>The data compression format. The following values are allowed:
|
300
|
+
`JPEG2000' (also `JP2' or `JPX'), `JPEG' (or `JPG') and `LOSSLESS'
|
301
|
+
(synonyms are `DEFLATE' and `PNG'). pdfbeads will attempt to use the JPEG2000
|
302
|
+
compression by default. However it falls back to JPEG if JPEG2000 format
|
303
|
+
is not supported by the currently used ImageMagick build (which is often
|
304
|
+
the case). If the option has been set to LOSSLESS, then pdfbeads will
|
305
|
+
compress background images with the deflate method. Of course this choice
|
306
|
+
would normally result into producing a much larger output file than with
|
307
|
+
JPEG2000 or JPEG.</p></dd>
|
308
|
+
|
309
|
+
<dt>-B, --bg-resolution DPI</dt>
|
310
|
+
<dd><p>The resolution for the background. Reasonable values usually lie
|
311
|
+
between 150 and 300 dpi (300 by default).</p></dd>
|
312
|
+
|
313
|
+
<dt>-g, --grayscale</dt>
|
314
|
+
<dd><p>Forces pdfbeads to convert color images into grayscale. This option
|
315
|
+
would be useful for processing images which have been produced by scanning
|
316
|
+
pages with gray pictures in color mode and haven’t been previously converted
|
317
|
+
to grayscale. Such a situation may often occur, for example, when processing
|
318
|
+
digital photos with ScanTailor.</p></dd>
|
319
|
+
|
320
|
+
<p>When pdfbeads loads previously produced background image from the hard
|
321
|
+
drive, it doesn’t perform any additional processing. JPEG and JPEG2000
|
322
|
+
images are inserted into the resulting PDF file “as is”, while images
|
323
|
+
taken from TIFF and PNG files are compressed with the deflate method.
|
324
|
+
However if there are several <tt>*.bg.*</tt> or <tt>*.sep.*</tt> files
|
325
|
+
which have the same base name, but different extensions, then the graphical
|
326
|
+
format specified with the <tt>--bg-compression</tt> option will take
|
327
|
+
precedence.</p>
|
328
|
+
|
329
|
+
</dl>
|
330
|
+
|
331
|
+
<h2>Separating color images using a mask image</h2>
|
332
|
+
|
333
|
+
<p>Separating a scanned image into distinct layers is especially difficult
|
334
|
+
in case the text has been printed above a picture or texture. In order to
|
335
|
+
effectively package such a page into a pdf file one should prepare two
|
336
|
+
graphical files:</p>
|
337
|
+
|
338
|
+
<ul>
|
339
|
+
|
340
|
+
<li><p>a bitonal or indexed image containing just the scanned text or any
|
341
|
+
other elements supposed to be placed into the foreground layer;</p></li>
|
342
|
+
|
343
|
+
<li><p>a color scan of the same page (pdfbeads recognizes such images
|
344
|
+
by the <tt>*.color.*</tt> filename suffix).</p></li>
|
345
|
+
|
346
|
+
</ul>
|
347
|
+
|
348
|
+
<p>The first file will serve as a stencil: basing on its shapes pdfbeads will
|
349
|
+
attempt to produce from the color scan two new images, so that the first
|
350
|
+
one (with the <tt>*.bg.*</tt> suffix) will contain just the color
|
351
|
+
background cleaned up from any text data, while on the second one (with the
|
352
|
+
<tt>*.fg.*</tt> suffix) just the mask elements with the corresponding
|
353
|
+
texture will remain. This procedure is very similar to one performed by
|
354
|
+
the <tt>djvumake</tt> when we run it with the <tt>PPM</tt> option.
|
355
|
+
In either case the purpose is to produce a 3-layered page where the first
|
356
|
+
color layer is responsible for the image background while the second one
|
357
|
+
is used to specify colors and textures for the mask which is placed
|
358
|
+
above.</p>
|
359
|
+
|
360
|
+
<p>In order to achieve the desired result it is necessary that the mask
|
361
|
+
can be placed above the color images without any shifts or distortions.
|
362
|
+
On the other hand, it is OK if the two images have different resolutions
|
363
|
+
(and thus different pixel sizes): in such a case pdfbeads will first resize
|
364
|
+
the stencil so that it matches the size of the color image. Note that, if
|
365
|
+
all the text at the page is black (or at least darker than the background),
|
366
|
+
it would be convenient to use ScanTailor for producing both the source
|
367
|
+
graphical files. In order to do that one should output the same page first
|
368
|
+
as “Black and White” and then as “Color/Grayscale”.</p>
|
369
|
+
|
370
|
+
<p>Also note that, if the stencil image is represented with an indexed
|
371
|
+
(but not bitonal) image with the number of colors equal to or less than
|
372
|
+
the current value of the <tt>--max-colors</tt> option, then pdfbeads
|
373
|
+
will not create a <tt>*.fg.*</tt> file: instead it will just place
|
374
|
+
the stencil with the previously specified colors above the background
|
375
|
+
layer cleaned up from the text data.</p>
|
376
|
+
|
377
|
+
<p>To conclude this section I’d like to mention that the segmentation
|
378
|
+
algorithm used by pdfbeads has been inspired by
|
379
|
+
<a href="http://www.imagemagick.org/discourse-server/viewtopic.php?p=41498#p41498">a
|
380
|
+
thread at the ImageMagick forum</a>, where possible methods to remove text
|
381
|
+
from an image and then fill the resulting “gaps” basing on the values of the
|
382
|
+
neighboring pixels have been discussed.</p>
|
383
|
+
|
384
|
+
<h2>Additional features</h2>
|
385
|
+
|
386
|
+
<h3>Adding metadata</h3>
|
387
|
+
|
388
|
+
<p>In order to include some information about author, book title etc.
|
389
|
+
into the PDF file going to be produced by pdfbeads, one should first put
|
390
|
+
those data into a special ASCII or UTF-8 encoded text file. Each line
|
391
|
+
of the file should be formatted as follows:</p>
|
392
|
+
|
393
|
+
<pre>/&lt;KEYWORD&gt;: "Some text"
|
394
|
+
</pre>
|
395
|
+
|
396
|
+
<p>The following keyword strings are currently recognized by pdfbeads:
|
397
|
+
<tt>Title</tt>, <tt>Author</tt>, <tt>Subject</tt> and <tt>Keywords</tt>.
|
398
|
+
Any lines starting with the `#' character are considered comments and
|
399
|
+
ignored.</p>
|
400
|
+
|
401
|
+
<p>A reference to the metadata file can be passed to pdfbeads via the
|
402
|
+
<tt>-M</tt> (or <tt>--meta</tt>) option.</p>
|
403
|
+
|
404
|
+
<h3>Page labels</h3>
|
405
|
+
|
406
|
+
<p>pdfbeads allows to generate page labels which may be then displayed by
|
407
|
+
a PDF viewer instead of physical page numbers. Thus it is possible
|
408
|
+
to bring page numbering of the electronic document into accordance with
|
409
|
+
the pagination of the paper book. Page labels may be specified with the
|
410
|
+
<tt>-L</tt> (or <tt>--labels</tt>) command line key. This option takes
|
411
|
+
an argument which should be enclosed into quotation marks and may contain
|
412
|
+
one or more numbering range specifications, separated with semicolons.</p>
|
413
|
+
|
414
|
+
<p>A numbering range is constructed from the following components (each of
|
415
|
+
them being optional):</p>
|
416
|
+
|
417
|
+
<ul>
|
418
|
+
<li><p>The physical number of the first page of the given range in the
|
419
|
+
PDF document, separated with a colon from the rest of the specification.
|
420
|
+
Note that pages in PDF documents are numbered starting from zero, so for
|
421
|
+
the first range this value should always be zero.</p></li>
|
422
|
+
|
423
|
+
<li><p>An arbitrary numbering prefix (any characters, except a double
|
424
|
+
quotation mark, a colon, a semicolon, and a percent sign are allowed
|
425
|
+
here).</p></li>
|
426
|
+
|
427
|
+
<li><p>A numbering format description, which starts from a percent sign
|
428
|
+
followed by a single Latin letter corresponding to a particular numbering
|
429
|
+
style:</p></li>
|
430
|
+
|
431
|
+
<dl>
|
432
|
+
|
433
|
+
<dt>D</dt>
|
434
|
+
<dd><p>arabic digits;</p></dd>
|
435
|
+
|
436
|
+
<dt>R</dt>
|
437
|
+
<dd><p>uppercase Roman numerals;</p></dd>
|
438
|
+
|
439
|
+
<dt>r</dt>
|
440
|
+
<dd><p>lowercase Roman numerals;</p></dd>
|
441
|
+
|
442
|
+
<dt>A</dt>
|
443
|
+
<dd><p>uppercase Latin letters;</p></dd>
|
444
|
+
|
445
|
+
<dt>a</dt>
|
446
|
+
<dd><p>lowercase Latin letters.</p></dd>
|
447
|
+
|
448
|
+
</dl>
|
449
|
+
|
450
|
+
<p>Between the percent sign and the numbering format identifier it is
|
451
|
+
possible to put an arbitrary number, thus setting the number to be
|
452
|
+
displayed for the first page of the given range (1 by default).</p>
|
453
|
+
|
454
|
+
</ul>
|
455
|
+
|
456
|
+
<p>Suppose for example that a book starts from two unnumbered title pages
|
457
|
+
followed by 32 pages numbered with Roman numerals. Then comes Arabic
|
458
|
+
pagination, which, however, starts straight from 33. So the following argument
|
459
|
+
for the <tt>--labels</tt> option would be appropriate:</p>
|
460
|
+
|
461
|
+
<pre>
|
462
|
+
"0:Title %D;2:%R;34:%33D"
|
463
|
+
</pre>
|
464
|
+
|
465
|
+
<h3>Building table of contents</h3>
|
466
|
+
|
467
|
+
<p>pdfbeads allows to add table of contents (PDF bookmarks) to the PDF file
|
468
|
+
to be generated. This is done with the <tt>-C</tt> (or <tt>--toc</tt>) option,
|
469
|
+
which accepts as an argument a path to a text file.</p>
|
470
|
+
|
471
|
+
<p>The TOC file should be UTF-8 encoded and consist of lines formatted
|
472
|
+
as follows (lines beginning from the `#' character are considered comments
|
473
|
+
and ignored):</p>
|
474
|
+
|
475
|
+
<pre>
|
476
|
+
&lt;indent&gt;"Heading" "Page number" [0|-|1|+]
|
477
|
+
</pre>
|
478
|
+
|
479
|
+
<p>The heading level is determined by its indent (which may be formed either
|
480
|
+
from spaces or from tabs, but mixing both styles inside the same file is not
|
481
|
+
allowed). The indent is followed by the fields of heading and page number,
|
482
|
+
which are separated with any number of space characters and may be enclosed
|
483
|
+
into double quotation marks if necessary. The last optional parameter
|
484
|
+
specifies, if this TOC entry should be displayed unfolded by default (the
|
485
|
+
characters `+' and `1' mean “yes”).</p>
|
486
|
+
|
487
|
+
<p>It is a good idea to use the <tt>--toc</tt> option together with
|
488
|
+
<tt>--labels</tt>. Thus it is possible to use in the TOC file the same
|
489
|
+
page numbers as in the paper book without taking care about any shifts
|
490
|
+
of the numbering.</p>
|
491
|
+
|
492
|
+
<h3>Adding text layer</h3>
|
493
|
+
|
494
|
+
<p>It is possible with pdfbeads to create PDF files with a hidden text
|
495
|
+
layer. The text for a hidden layer may be either obtained from
|
496
|
+
<a href="http://docs.google.com/View?docid=dfxcv4vc_67g844kf">hOCR</a>
|
497
|
+
files (hOCR is a HTML language extension, allowing to store information
|
498
|
+
about exact positioning of characters and markup elements on the page) or
|
499
|
+
imported from another PDF file.</p>
|
500
|
+
|
501
|
+
<p>In order to create files in the hOCR format, one should use an optical
|
502
|
+
character recognition program which supports this format, for example
|
503
|
+
<a href="https://launchpad.net/cuneiform-linux/">Cuneiform</a> or
|
504
|
+
<a href="http://code.google.com/p/tesseract-ocr/">Tesseract</a>.
|
505
|
+
The recognized text should be saved into the same directory as the other
|
506
|
+
files related to the project. Each recognized page should be represented
|
507
|
+
by a separate file which has the same base name as the source
|
508
|
+
image and the HTM(L) or HOCR extension. hOCR files are processed
|
509
|
+
automatically, provided that the Nokogiri extension is available to
|
510
|
+
the Ruby interpreter.</p>
|
511
|
+
|
512
|
+
<p>Another possible solution is to import the text layer from another PDF
|
513
|
+
file (naturally, that file should be produced by recognizing the very
|
514
|
+
same images which are then going to be processed
|
515
|
+
with pdfbeads). The name of that file should be passed to pdfbeads via the
|
516
|
+
<tt>-T</tt> (or <tt>--text-pdf</tt>) option. This
|
517
|
+
feature is especially important when one has to use for
|
518
|
+
text recognition a commercial application (for example,
|
519
|
+
<a href="http://www.abbyy.ru/finereader/">ABBYY FineReader</a>) which
|
520
|
+
doesn’t support the hOCR format. <strong>Warning:</strong> when
|
521
|
+
creating the intermediate PDF file in ABBYY FineReader one should use
|
522
|
+
either the “text under the page image” or the “text over
|
523
|
+
the page image” setting, since with other settings the placement of characters on
|
524
|
+
the page may not exactly match the source image
|
525
|
+
file.</p>
|
526
|
+
|
527
|
+
<h3>Processing files with the right-to-left text direction</h3>
|
528
|
+
|
529
|
+
<p>The <tt>-R</tt> (or <tt>--right-to-left</tt>) option allows to mark the
|
530
|
+
PDF file produced by pdfbeads with a special flag indicating that the main
|
531
|
+
text direction for the given document is right-to-left. This flag will allow
|
532
|
+
Adobe Reader™ to correctly order pages when displaying them in the
|
533
|
+
side-by-side mode.</p>
|
534
|
+
|
535
|
+
<h2>License</h2>
|
536
|
+
|
537
|
+
<p>This program is free software; you can redistribute it and/or modify
|
538
|
+
it under the terms of the GNU General Public License as published by
|
539
|
+
the Free Software Foundation; either version 2 of the License, or
|
540
|
+
(at your option) any later version.</p>
|
541
|
+
|
542
|
+
<p>This program is distributed in the hope that it will be useful,
|
543
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
544
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
545
|
+
GNU General Public License for more details.</p>
|
546
|
+
|
547
|
+
<p>You should have received a copy of the GNU General Public License
|
548
|
+
along with this program; if not, write to the Free Software
|
549
|
+
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.</p>
|
550
|
+
|
551
|
+
</body>
|
552
|
+
</html>
|