pdfbeads 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,149 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: UTF-8
3
+
4
+ ######################################################################
5
+ #
6
+ # PDFBeads -- convert scanned images to a single PDF file
7
+ # Version 1.0
8
+ #
9
+ # Unlike other PDF creation tools, this utility attempts to implement
10
+ # the approach typically used for DjVu books. Its key feature is
11
+ # separating scanned text (typically black, but indexed images with
12
+ # a small number of colors are also accepted) from halftone images
13
+ # placed into a background layer.
14
+ #
15
+ # Copyright (C) 2010 Alexey Kryukov (amkryukov@gmail.com).
16
+ # All rights reserved.
17
+ #
18
+ # This program is free software; you can redistribute it and/or modify
19
+ # it under the terms of the GNU General Public License as published by
20
+ # the Free Software Foundation; either version 2 of the License, or
21
+ # (at your option) any later version.
22
+ #
23
+ # This program is distributed in the hope that it will be useful,
24
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
25
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26
+ # GNU General Public License for more details.
27
+ #
28
+ # You should have received a copy of the GNU General Public License
29
+ # along with this program; if not, write to the Free Software
30
+ # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
31
+ #
32
+ #######################################################################
33
+
34
# A Hash that knows how to print itself using PDF dictionary syntax.
class PDFBeads::PDFBuilder::Dict < Hash
  # Create a dictionary pre-populated with the given key/value pairs.
  #
  # pairs -- a Hash whose entries are copied into the new dictionary.
  def initialize(pairs = {})
    # Call Hash#initialize explicitly without arguments. A bare `super`
    # would forward `pairs`, turning it into the default value returned
    # for every missing key.
    super()
    update(pairs)
  end

  # Serialize as "<<", one "/key value" line per entry, then ">>".
  # Keys and values are interpolated as is (via their own to_s).
  def to_s()
    s = "<<\n"
    each_pair { |key, value| s << "/#{key} #{value}\n" }
    s << ">>\n"
    s
  end
end
47
+
48
# A single PDF object: a dictionary plus an optional stream body.
# Each instance receives a unique numeric ID on creation.
class PDFBeads::PDFBuilder::XObj
  # Counter shared by all instances; holds the ID the next object will get.
  @@next_id = 1

  # d      -- initial dictionary entries (a Hash).
  # stream -- optional stream data; nil means the object has no stream.
  def initialize(d = {}, stream = nil)
    reinit(d, stream)
    @id = @@next_id
    @@next_id += 1
  end

  # Serialize the dictionary, the stream section (only when a stream is
  # present) and the closing "endobj" keyword. The "N 0 obj" header is
  # not produced here.
  def to_s()
    s = ''
    s << @d.to_s
    unless @stream.nil?
      s << "stream\n"
      s << @stream
      s << "\nendstream\n"
    end
    s << "endobj\n"
    s
  end

  # Replace both the dictionary and the stream of this object. The ID is
  # left untouched.
  def reinit(d = {}, stream = nil)
    # Use the fully qualified name: inside a compact "class A::B::C" body
    # the enclosing namespaces are not part of the constant lookup scope,
    # so a relative "PDFBuilder::Dict" only resolves if PDFBeads has been
    # mixed into Object by the caller.
    @d = PDFBeads::PDFBuilder::Dict.new(d)
    @stream = stream
    # String#to_binary is defined outside this file; presumably it marks
    # the string as raw bytes so that the length below is a byte count.
    @stream.to_binary if stream.kind_of? String
    @d['Length'] = stream.length.to_s unless stream.nil?
  end

  # Set (or overwrite) a dictionary entry.
  def addToDict(key, value)
    @d[key] = value
  end

  # Tell whether the dictionary has the given key.
  def hasInDict(key)
    @d.has_key? key
  end

  # Return the dictionary value stored under the given key.
  def getFromDict(key)
    @d[key]
  end

  # Delete a dictionary entry and return its former value.
  def removeFromDict(key)
    @d.delete(key)
  end

  # Return the numeric ID assigned to this object at creation time.
  def getID
    @id
  end

  # Return the number of entries in the dictionary.
  def dictLength
    @d.length
  end
end
100
+
101
# Collects PDF objects and serializes them into a complete PDF file body
# (header, objects, cross-reference table and trailer).
class PDFBeads::PDFBuilder::Doc
  def initialize()
    @objs = []
    @pages = []
  end

  # Append an object to the document and hand it back to the caller.
  def addObject(o)
    @objs << o
    o
  end

  # Register a page object; it is also added to the general object list.
  def addPage(p)
    @pages << p
    addObject(p)
  end

  # Build the whole file as one string.
  #
  # NOTE: cross-reference entries are written in insertion order and the
  # trailer always names objects 1 and 2 as /Root and /Info, so the result
  # is consistent only if the caller adds objects accordingly.
  def to_s()
    out = ''
    pos = 0
    offsets = []

    # Append a chunk to the output while tracking the running offset.
    emit = lambda do |chunk|
      chunk.to_binary
      out << chunk
      pos += chunk.length
    end

    emit.call("%PDF-1.5\n")
    @objs.each do |xobj|
      offsets << pos
      emit.call("#{xobj.getID} 0 obj\n")
      emit.call("#{xobj.to_s}\n")
    end

    # The cross-reference table starts right after the last object.
    xref_pos = pos
    entry_count = offsets.length + 1

    out << "xref\n"
    out << "0 #{entry_count}\n"
    out << "0000000000 65535 f \n"
    offsets.each { |off| out << sprintf("%010d 00000 n \n", off) }
    out << "\n"
    out << "trailer\n"
    out << "<< /Size #{entry_count} /Root 1 0 R /Info 2 0 R >>\n"
    out << "startxref\n"
    out << "#{xref_pos}\n"
    out << "%%EOF"

    out
  end
end
@@ -0,0 +1,533 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: UTF-8
3
+
4
+ ######################################################################
5
+ #
6
+ # PDFBeads -- convert scanned images to a single PDF file
7
+ # Version 1.0
8
+ #
9
+ # Unlike other PDF creation tools, this utility attempts to implement
10
+ # the approach typically used for DjVu books. Its key feature is
11
+ # separating scanned text (typically black, but indexed images with
12
+ # a small number of colors are also accepted) from halftone images
13
+ # placed into a background layer.
14
+ #
15
+ # Copyright (C) 2010 Alexey Kryukov (amkryukov@gmail.com).
16
+ # All rights reserved.
17
+ #
18
+ # This program is free software; you can redistribute it and/or modify
19
+ # it under the terms of the GNU General Public License as published by
20
+ # the Free Software Foundation; either version 2 of the License, or
21
+ # (at your option) any later version.
22
+ #
23
+ # This program is distributed in the hope that it will be useful,
24
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
25
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26
+ # GNU General Public License for more details.
27
+ #
28
+ # You should have received a copy of the GNU General Public License
29
+ # along with this program; if not, write to the Free Software
30
+ # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
31
+ #
32
+ #######################################################################
33
+
34
+ # This class provides auxiliary data (such as basic font properties or
35
+ # a width and a PostScript name for an arbitrary Unicode codepoint)
36
+ # for building a PDF font object with an arbitrary set of supported
37
+ # characters. Note that the returned properties (both of the entire font
38
+ # and individual characters) are hardcoded and correspond to those of
39
+ # Times New Roman. The reason is that we need this font just for a hidden
40
+ # text layer, so visual appearance doesn't matter.
41
+ #
42
class PDFBeads::PDFBuilder::FontDataProvider
  # Access a hardcoded set of standard font properties (Ascent, Descent, etc.)
  attr_reader :header

  def initialize()
    @header = {
      'Ascent'      => 694,
      'XHeight'     => 447,
      'CapHeight'   => 662,
      'Descent'     => -213,
      'Flags'       => 34,
      'FontBBox'    => '[ -79 -216 1009 913 ]',
      'ItalicAngle' => 0,
      'StemV'       => 81
    }

    # Unicode codepoint => [ PostScript glyph name, glyph width ].
    known = {
      -1     => ["/.notdef", 250],
      0x0020 => ["/space", 250],
      0x0021 => ["/exclam", 333],
      0x0022 => ["/quotedbl", 408],
      0x0023 => ["/numbersign", 500],
      0x0024 => ["/dollar", 500],
      0x0025 => ["/percent", 833],
      0x0026 => ["/ampersand", 778],
      0x0027 => ["/quotesingle", 180],
      0x0028 => ["/parenleft", 333],
      0x0029 => ["/parenright", 333],
      0x002A => ["/asterisk", 500],
      0x002B => ["/plus", 564],
      0x002C => ["/comma", 250],
      0x002D => ["/hyphen", 333],
      0x002E => ["/period", 250],
      0x002F => ["/slash", 278],
      0x0030 => ["/zero", 500],
      0x0031 => ["/one", 500],
      0x0032 => ["/two", 500],
      0x0033 => ["/three", 500],
      0x0034 => ["/four", 500],
      0x0035 => ["/five", 500],
      0x0036 => ["/six", 500],
      0x0037 => ["/seven", 500],
      0x0038 => ["/eight", 500],
      0x0039 => ["/nine", 500],
      0x003A => ["/colon", 278],
      0x003B => ["/semicolon", 278],
      0x003C => ["/less", 564],
      0x003D => ["/equal", 564],
      0x003E => ["/greater", 564],
      0x003F => ["/question", 444],
      0x0040 => ["/at", 921],
      0x0041 => ["/A", 722],
      0x0042 => ["/B", 667],
      0x0043 => ["/C", 667],
      0x0044 => ["/D", 722],
      0x0045 => ["/E", 611],
      0x0046 => ["/F", 556],
      0x0047 => ["/G", 722],
      0x0048 => ["/H", 722],
      0x0049 => ["/I", 333],
      0x004A => ["/J", 389],
      0x004B => ["/K", 722],
      0x004C => ["/L", 611],
      0x004D => ["/M", 889],
      0x004E => ["/N", 722],
      0x004F => ["/O", 722],
      0x0050 => ["/P", 556],
      0x0051 => ["/Q", 722],
      0x0052 => ["/R", 667],
      0x0053 => ["/S", 556],
      0x0054 => ["/T", 611],
      0x0055 => ["/U", 722],
      0x0056 => ["/V", 722],
      0x0057 => ["/W", 944],
      0x0058 => ["/X", 722],
      0x0059 => ["/Y", 722],
      0x005A => ["/Z", 611],
      0x005B => ["/bracketleft", 333],
      0x005C => ["/backslash", 278],
      0x005D => ["/bracketright", 333],
      0x005E => ["/asciicircum", 469],
      0x005F => ["/underscore", 500],
      0x0060 => ["/grave", 333],
      0x0061 => ["/a", 444],
      0x0062 => ["/b", 500],
      0x0063 => ["/c", 444],
      0x0064 => ["/d", 500],
      0x0065 => ["/e", 444],
      0x0066 => ["/f", 333],
      0x0067 => ["/g", 500],
      0x0068 => ["/h", 500],
      0x0069 => ["/i", 278],
      0x006A => ["/j", 278],
      0x006B => ["/k", 500],
      0x006C => ["/l", 278],
      0x006D => ["/m", 778],
      0x006E => ["/n", 500],
      0x006F => ["/o", 500],
      0x0070 => ["/p", 500],
      0x0071 => ["/q", 500],
      0x0072 => ["/r", 333],
      0x0073 => ["/s", 389],
      0x0074 => ["/t", 278],
      0x0075 => ["/u", 500],
      0x0076 => ["/v", 500],
      0x0077 => ["/w", 722],
      0x0078 => ["/x", 500],
      0x0079 => ["/y", 500],
      0x007A => ["/z", 444],
      0x007B => ["/braceleft", 480],
      0x007C => ["/bar", 200],
      0x007D => ["/braceright", 480],
      0x007E => ["/asciitilde", 541],
      0x00A1 => ["/exclamdown", 333],
      0x00A2 => ["/cent", 500],
      0x00A3 => ["/sterling", 500],
      0x00A4 => ["/currency", 500],
      0x00A5 => ["/yen", 500],
      0x00A6 => ["/brokenbar", 200],
      0x00A7 => ["/section", 500],
      0x00A8 => ["/dieresis", 333],
      0x00A9 => ["/copyright", 760],
      0x00AA => ["/ordfeminine", 276],
      0x00AB => ["/guillemotleft", 500],
      0x00AC => ["/logicalnot", 564],
      0x00AD => ["/softhyphen", 333],
      0x00AE => ["/registered", 760],
      0x00AF => ["/macron", 333],
      0x00B0 => ["/degree", 400],
      0x00B1 => ["/plusminus", 564],
      0x00B2 => ["/twosuperior", 300],
      0x00B3 => ["/threesuperior", 300],
      0x00B4 => ["/acute", 333],
      0x00B5 => ["/mu", 500],
      0x00B6 => ["/paragraph", 453],
      0x00B7 => ["/periodcentered", 250],
      0x00B8 => ["/cedilla", 333],
      0x00B9 => ["/onesuperior", 300],
      0x00BA => ["/ordmasculine", 310],
      0x00BB => ["/guillemotright", 500],
      0x00BC => ["/onequarter", 750],
      0x00BD => ["/onehalf", 750],
      0x00BE => ["/threequarters", 750],
      0x00BF => ["/questiondown", 444],
      0x00C0 => ["/Agrave", 722],
      0x00C1 => ["/Aacute", 722],
      0x00C2 => ["/Acircumflex", 722],
      0x00C3 => ["/Atilde", 722],
      0x00C4 => ["/Adieresis", 722],
      0x00C5 => ["/Aring", 722],
      0x00C6 => ["/AE", 889],
      0x00C7 => ["/Ccedilla", 667],
      0x00C8 => ["/Egrave", 611],
      0x00C9 => ["/Eacute", 611],
      0x00CA => ["/Ecircumflex", 611],
      0x00CB => ["/Edieresis", 611],
      0x00CC => ["/Igrave", 333],
      0x00CD => ["/Iacute", 333],
      0x00CE => ["/Icircumflex", 333],
      0x00CF => ["/Idieresis", 333],
      0x00D0 => ["/Eth", 722],
      0x00D1 => ["/Ntilde", 722],
      0x00D2 => ["/Ograve", 722],
      0x00D3 => ["/Oacute", 722],
      0x00D4 => ["/Ocircumflex", 722],
      0x00D5 => ["/Otilde", 722],
      0x00D6 => ["/Odieresis", 722],
      0x00D7 => ["/multiply", 564],
      0x00D8 => ["/Oslash", 722],
      0x00D9 => ["/Ugrave", 722],
      0x00DA => ["/Uacute", 722],
      0x00DB => ["/Ucircumflex", 722],
      0x00DC => ["/Udieresis", 722],
      0x00DD => ["/Yacute", 722],
      0x00DE => ["/Thorn", 556],
      0x00DF => ["/germandbls", 500],
      0x00E0 => ["/agrave", 444],
      0x00E1 => ["/aacute", 444],
      0x00E2 => ["/acircumflex", 444],
      0x00E3 => ["/atilde", 444],
      0x00E4 => ["/adieresis", 444],
      0x00E5 => ["/aring", 444],
      0x00E6 => ["/ae", 667],
      0x00E7 => ["/ccedilla", 444],
      0x00E8 => ["/egrave", 444],
      0x00E9 => ["/eacute", 444],
      0x00EA => ["/ecircumflex", 444],
      0x00EB => ["/edieresis", 444],
      0x00EC => ["/igrave", 278],
      0x00ED => ["/iacute", 278],
      0x00EE => ["/icircumflex", 278],
      0x00EF => ["/idieresis", 278],
      0x00F0 => ["/eth", 500],
      0x00F1 => ["/ntilde", 500],
      0x00F2 => ["/ograve", 500],
      0x00F3 => ["/oacute", 500],
      0x00F4 => ["/ocircumflex", 500],
      0x00F5 => ["/otilde", 500],
      0x00F6 => ["/odieresis", 500],
      0x00F7 => ["/divide", 564],
      0x00F8 => ["/oslash", 500],
      0x00F9 => ["/ugrave", 500],
      0x00FA => ["/uacute", 500],
      0x00FB => ["/ucircumflex", 500],
      0x00FC => ["/udieresis", 500],
      0x00FD => ["/yacute", 500],
      0x00FE => ["/thorn", 500],
      0x00FF => ["/ydieresis", 500],
      0x0131 => ["/dotlessi", 278],
      0x0141 => ["/Lslash", 611],
      0x0142 => ["/lslash", 278],
      0x0152 => ["/OE", 889],
      0x0153 => ["/oe", 722],
      0x0160 => ["/Scaron", 556],
      0x0161 => ["/scaron", 389],
      0x0178 => ["/Ydieresis", 722],
      0x017D => ["/Zcaron", 611],
      0x017E => ["/zcaron", 444],
      0x0192 => ["/florin", 488],
      0x02C6 => ["/circumflex", 333],
      0x02C7 => ["/caron", 333],
      0x02D8 => ["/breve", 333],
      0x02D9 => ["/dotaccent", 333],
      0x02DA => ["/ring", 333],
      0x02DB => ["/ogonek", 333],
      0x02DC => ["/tilde", 333],
      0x02DD => ["/hungarumlaut", 333],
      0x0394 => ["/Delta", 643],
      0x0401 => ["/afii10023", 611],
      0x0402 => ["/afii10051", 752],
      0x0403 => ["/afii10052", 578],
      0x0404 => ["/afii10053", 660],
      0x0405 => ["/afii10054", 556],
      0x0406 => ["/afii10055", 333],
      0x0407 => ["/afii10056", 333],
      0x0408 => ["/afii10057", 389],
      0x0409 => ["/afii10058", 872],
      0x040A => ["/afii10059", 872],
      0x040B => ["/afii10060", 741],
      0x040C => ["/afii10061", 667],
      0x040E => ["/afii10062", 708],
      0x040F => ["/afii10145", 722],
      0x0410 => ["/afii10017", 722],
      0x0411 => ["/afii10018", 574],
      0x0412 => ["/afii10019", 667],
      0x0413 => ["/afii10020", 578],
      0x0414 => ["/afii10021", 682],
      0x0415 => ["/afii10022", 611],
      0x0416 => ["/afii10024", 896],
      0x0417 => ["/afii10025", 501],
      0x0418 => ["/afii10026", 722],
      0x0419 => ["/afii10027", 722],
      0x041A => ["/afii10028", 667],
      0x041B => ["/afii10029", 678],
      0x041C => ["/afii10030", 889],
      0x041D => ["/afii10031", 722],
      0x041E => ["/afii10032", 722],
      0x041F => ["/afii10033", 722],
      0x0420 => ["/afii10034", 556],
      0x0421 => ["/afii10035", 667],
      0x0422 => ["/afii10036", 611],
      0x0423 => ["/afii10037", 708],
      0x0424 => ["/afii10038", 790],
      0x0425 => ["/afii10039", 722],
      0x0426 => ["/afii10040", 722],
      0x0427 => ["/afii10041", 650],
      0x0428 => ["/afii10042", 1009],
      0x0429 => ["/afii10043", 1009],
      0x042A => ["/afii10044", 706],
      0x042B => ["/afii10045", 872],
      0x042C => ["/afii10046", 574],
      0x042D => ["/afii10047", 660],
      0x042E => ["/afii10048", 1028],
      0x042F => ["/afii10049", 667],
      0x0430 => ["/afii10065", 444],
      0x0431 => ["/afii10066", 509],
      0x0432 => ["/afii10067", 472],
      0x0433 => ["/afii10068", 410],
      0x0434 => ["/afii10069", 509],
      0x0435 => ["/afii10070", 444],
      0x0436 => ["/afii10072", 691],
      0x0437 => ["/afii10073", 395],
      0x0438 => ["/afii10074", 535],
      0x0439 => ["/afii10075", 535],
      0x043A => ["/afii10076", 486],
      0x043B => ["/afii10077", 499],
      0x043C => ["/afii10078", 633],
      0x043D => ["/afii10079", 535],
      0x043E => ["/afii10080", 500],
      0x043F => ["/afii10081", 535],
      0x0440 => ["/afii10082", 500],
      0x0441 => ["/afii10083", 444],
      0x0442 => ["/afii10084", 437],
      0x0443 => ["/afii10085", 500],
      0x0444 => ["/afii10086", 648],
      0x0445 => ["/afii10087", 500],
      0x0446 => ["/afii10088", 535],
      0x0447 => ["/afii10089", 503],
      0x0448 => ["/afii10090", 770],
      0x0449 => ["/afii10091", 770],
      0x044A => ["/afii10092", 517],
      0x044B => ["/afii10093", 672],
      0x044C => ["/afii10094", 456],
      0x044D => ["/afii10095", 429],
      0x044E => ["/afii10096", 747],
      0x044F => ["/afii10097", 460],
      0x0451 => ["/afii10071", 444],
      0x0452 => ["/afii10099", 483],
      0x0453 => ["/afii10100", 410],
      0x0454 => ["/afii10101", 429],
      0x0455 => ["/afii10102", 389],
      0x0456 => ["/afii10103", 278],
      0x0457 => ["/afii10104", 278],
      0x0458 => ["/afii10105", 278],
      0x0459 => ["/afii10106", 727],
      0x045A => ["/afii10107", 723],
      0x045B => ["/afii10108", 500],
      0x045C => ["/afii10109", 486],
      0x045E => ["/afii10110", 500],
      0x045F => ["/afii10193", 535],
      0x0462 => ["/afii10146", 648],
      0x0463 => ["/afii10194", 514],
      0x0472 => ["/afii10147", 722],
      0x0473 => ["/afii10195", 500],
      0x0474 => ["/afii10148", 771],
      0x0475 => ["/afii10196", 536],
      0x0490 => ["/afii10050", 450],
      0x0491 => ["/afii10098", 351],
      0x2013 => ["/endash", 500],
      0x2014 => ["/emdash", 1000],
      0x2018 => ["/quoteleft", 333],
      0x2019 => ["/quoteright", 333],
      0x201A => ["/quotesinglbase", 333],
      0x201C => ["/quotedblleft", 444],
      0x201D => ["/quotedblright", 444],
      0x201E => ["/quotedblbase", 444],
      0x2020 => ["/dagger", 500],
      0x2021 => ["/daggerdbl", 500],
      0x2022 => ["/bullet", 350],
      0x2026 => ["/ellipsis", 1000],
      0x2030 => ["/perthousand", 1000],
      0x2039 => ["/guilsinglleft", 333],
      0x203A => ["/guilsinglright", 333],
      0x20AC => ["/Euro", 500],
      0x2116 => ["/afii61352", 954],
      0x2122 => ["/trademark", 980],
      0x2202 => ["/partialdiff", 490],
      0x2212 => ["/minus", 564],
      0x221A => ["/radical", 552],
      0x221E => ["/infinity", 708],
      0x2248 => ["/approxequal", 564],
      0x2260 => ["/notequal", 564],
      0x2264 => ["/lessequal", 564],
      0x2265 => ["/greaterequal", 564],
      0xFB01 => ["/fi", 556],
      0xFB02 => ["/fl", 556],
    }

    # Codepoints missing from the table get a generated "/uniXXXX" name and
    # a width of 500; the entry is stored on first access. The block must be
    # given to Hash.new: assigning a Proc via Hash#default= would merely make
    # the Proc object itself the value returned for unknown keys.
    @chardata = Hash.new do |fd, uni|
      fd[uni] = [ sprintf( "/uni%04X",uni ), 500 ]
    end
    @chardata.update( known )

    @encodings = Array.new()
    @wlists = Array.new()
  end

  # Return the width of a given UTF-8 string formatted with our hardcoded
  # font at a given point size. A character which cannot be processed
  # (e.g. an invalid byte sequence) triggers a warning on stderr and is
  # measured as one question mark per raw byte.
  def getLineWidth( line,size )
    w = 0.0
    line.each_char do |uc|
      begin
        w += @chardata[uc.ord][1] * size / 1000.0
      rescue
        rawbytes = uc.unpack( 'C*' )
        bs = rawbytes.map{ |b| sprintf( "%02x",b ) }.join
        $stderr.puts( "Warning: an invalid UTF-8 sequence (#{bs}) in the hOCR data." )
        w += ( @chardata[0x003F][1] * size / 1000.0 ) * rawbytes.length
      end
    end
    w.to_f
  end

  # Take an array of UTF-8 characters and return an array of the
  # corresponding PostScript glyph names
  def getEncoding( enc )
    enc.map{ |char| @chardata[char.ord][0] }
  end

  # Take an array of UTF-8 characters and return an array of the
  # corresponding glyph widths
  def getWidths( enc )
    enc.map{ |char| @chardata[char.ord][1] }
  end

  # Take an array of UTF-8 characters and return the corresponding
  # ToUnicode cmap object. The position of a character in the array is
  # its one-byte character code. Runs of consecutive codepoints are
  # written as bfrange entries, everything else as bfchar entries;
  # entries with a negative codepoint (such as -1 for .notdef) are
  # not mapped at all.
  def getCMAP( enc )
    cmap = [
      "/CIDInit /ProcSet findresource begin\n",
      "12 dict begin\n",
      "begincmap\n",
      "/CIDSystemInfo\n",
      "<<\n",
      "  /Registry ( PDFBeads )\n",
      "  /Ordering ( Custom )\n",
      "  /Supplement 0\n",
      ">> def\n",
      "/CMapName /PDFBeads-Custom def\n",
      "/CMapType 2 def\n",
      "1 begincodespacerange\n",
      "<00> <FF>\n",
      "endcodespacerange\n",
    ].join( '' )

    codes = enc.map{ |char| char.ord }

    # Pass 1: collect runs of consecutive codepoints. "prev" starts as nil
    # (not -1), so a leading codepoint 0 cannot open a run at index -1.
    ranges = Array.new()
    run = nil
    prev = nil
    codes.each_with_index do |cur, i|
      if continues_run?( prev,cur )
        if run.nil?
          run = Hash[ 'start' => i-1, 'end' => i, 'uni' => prev ]
        else
          run['end'] = i
        end
      elsif run
        ranges << run
        run = nil
      end
      prev = cur
    end
    ranges << run if run

    # Pass 2: every remaining mappable code becomes a bfchar entry.
    in_range = Array.new( codes.length,false )
    ranges.each do |cr|
      ( cr['start']..cr['end'] ).each{ |i| in_range[i] = true }
    end
    singles = Array.new()
    codes.each_with_index do |cp, i|
      singles << [i, cp] unless in_range[i] or cp < 0
    end

    # The CMap format allows at most 100 entries per section, hence
    # the slicing.
    ranges.each_slice( 100 ) do |group|
      cmap << "#{group.length} beginbfrange\n"
      group.each do |cr|
        cmap << sprintf( "<%02X> <%02X> <%04X>\n",
                         cr['start'], cr['end'], cr['uni'] )
      end
      cmap << "endbfrange\n"
    end

    singles.each_slice( 100 ) do |group|
      cmap << "#{group.length} beginbfchar\n"
      group.each do |i, cp|
        cmap << sprintf( "<%02X> <%s>\n", i, utf16be_hex( cp ) )
      end
      cmap << "endbfchar\n"
    end

    cmap << "endcmap\n"
    cmap << "CMapName currentdict /CMap defineresource pop\n"
    cmap << "end\n"
    cmap << "end\n"

    # Fully qualified: the enclosing namespaces are not part of the
    # constant lookup scope of a compact "class A::B::C" body.
    PDFBeads::PDFBuilder::XObj.new( Hash[
      'Filter' => '/FlateDecode',
    ], Zlib::Deflate.deflate( cmap,9 ) )
  end

  private

  # Tell whether codepoint cur may extend a bfrange run ending at prev.
  # Both must be non-negative BMP codepoints, and the low byte must not
  # wrap around to 0, since a bfrange destination is advanced by
  # incrementing its last byte only.
  def continues_run?( prev,cur )
    return false if prev.nil? or prev < 0
    cur == prev + 1 and cur <= 0xFFFF and ( cur & 0xFF ) != 0
  end

  # Hex digits of a codepoint encoded as UTF-16BE: four digits for the
  # BMP, a surrogate pair (eight digits) above U+FFFF.
  def utf16be_hex( cp )
    return sprintf( "%04X",cp ) if cp <= 0xFFFF
    v = cp - 0x10000
    sprintf( "%04X%04X", 0xD800 + ( v >> 10 ), 0xDC00 + ( v & 0x3FF ) )
  end
end