pdfbeads 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/COPYING +339 -0
- data/ChangeLog +3 -0
- data/README +53 -0
- data/bin/pdfbeads +189 -0
- data/doc/pdfbeads.ru.html +509 -0
- data/lib/imageinspector.rb +503 -0
- data/lib/pdfbeads.rb +93 -0
- data/lib/pdfbeads/pdfbuilder.rb +699 -0
- data/lib/pdfbeads/pdfdoc.rb +149 -0
- data/lib/pdfbeads/pdffont.rb +533 -0
- data/lib/pdfbeads/pdflabels.rb +139 -0
- data/lib/pdfbeads/pdfpage.rb +466 -0
- data/lib/pdfbeads/pdftoc.rb +160 -0
- metadata +82 -0
@@ -0,0 +1,149 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: UTF-8
|
3
|
+
|
4
|
+
######################################################################
|
5
|
+
#
|
6
|
+
# PDFBeads -- convert scanned images to a single PDF file
|
7
|
+
# Version 1.0
|
8
|
+
#
|
9
|
+
# Unlike other PDF creation tools, this utility attempts to implement
|
10
|
+
# the approach typically used for DjVu books. Its key feature is
|
11
|
+
# separating scanned text (typically black, but indexed images with
|
12
|
+
# a small number of colors are also accepted) from halftone images
|
13
|
+
# placed into a background layer.
|
14
|
+
#
|
15
|
+
# Copyright (C) 2010 Alexey Kryukov (amkryukov@gmail.com).
|
16
|
+
# All rights reserved.
|
17
|
+
#
|
18
|
+
# This program is free software; you can redistribute it and/or modify
|
19
|
+
# it under the terms of the GNU General Public License as published by
|
20
|
+
# the Free Software Foundation; either version 2 of the License, or
|
21
|
+
# (at your option) any later version.
|
22
|
+
#
|
23
|
+
# This program is distributed in the hope that it will be useful,
|
24
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
25
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
26
|
+
# GNU General Public License for more details.
|
27
|
+
#
|
28
|
+
# You should have received a copy of the GNU General Public License
|
29
|
+
# along with this program; if not, write to the Free Software
|
30
|
+
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
31
|
+
#
|
32
|
+
#######################################################################
|
33
|
+
|
34
|
+
# A PDF dictionary: a Hash that knows how to write itself out in the
# PDF "<< /Key value ... >>" notation.
class PDFBeads::PDFBuilder::Dict < Hash
  # Create a dictionary pre-populated with the given key/value pairs.
  #
  # pairs -- a Hash whose entries are copied into the new dictionary
  def initialize(pairs = {})
    # Hash#initialize must be called without arguments here: a bare `super`
    # would forward +pairs+ and install it as the default value, so that
    # every lookup of a missing key would return that Hash instead of nil.
    super()
    update( pairs )
  end

  # Serialize the dictionary, one "/Key value" entry per line, using the
  # string form (to_s) of each key and value. The result ends with a newline.
  def to_s()
    s = "<<\n"
    each_pair{ |key, value| s << "/#{key} #{value}\n" }
    s << ">>\n"
    s
  end
end
|
47
|
+
|
48
|
+
# A single numbered PDF object: a dictionary plus an optional stream body.
# Every instance draws its object number from a counter shared by the class.
class PDFBeads::PDFBuilder::XObj
  @@next_id = 1

  # d      -- initial dictionary entries
  # stream -- stream body, or nil for an object without a stream
  def initialize(d = {}, stream = nil)
    reinit(d, stream)
    @id = @@next_id
    @@next_id = @id + 1
  end

  # Serialize the dictionary, the stream (if any) and the closing "endobj".
  # The "N 0 obj" header line is not produced here.
  def to_s()
    out = ''
    out << @d.to_s
    out << "stream\n" << @stream << "\nendstream\n" unless @stream.nil?
    out << "endobj\n"
  end

  # Replace the dictionary and the stream of this object; the object
  # number is left untouched. With a stream, 'Length' is set from it.
  def reinit(d = {}, stream = nil)
    @d = PDFBuilder::Dict.new(d)
    @stream = stream
    return if stream.nil?

    # to_binary is not defined in this file; presumably it switches the
    # string to a byte-oriented encoding so that length counts bytes -- confirm.
    stream.to_binary if stream.kind_of?(String)
    @d['Length'] = stream.length.to_s
  end

  # Set a dictionary entry; returns the value.
  def addToDict(key, value)
    @d.store(key, value)
  end

  # Tell whether the dictionary contains the key.
  def hasInDict(key)
    @d.key?(key)
  end

  # Fetch the dictionary entry stored under the key.
  def getFromDict(key)
    @d[key]
  end

  # Drop a dictionary entry; returns the removed value.
  def removeFromDict(key)
    @d.delete(key)
  end

  # The object number assigned at construction time.
  def getID
    @id
  end

  # Number of entries currently in the dictionary.
  def dictLength
    @d.size
  end
end
|
100
|
+
|
101
|
+
# Collects PDF objects and serializes them, together with the
# cross-reference table and the trailer, into one PDF file image.
class PDFBeads::PDFBuilder::Doc
  def initialize()
    @objs = []
    @pages = []
  end

  # Queue an object for output; returns that same object.
  def addObject(o)
    @objs << o
    o
  end

  # Remember the object as a page and queue it for output as well.
  def addPage(p)
    @pages << p
    addObject(p)
  end

  # Produce the whole file as a string.
  #
  # Cross-reference entries are written in the order the objects were
  # added, right after the entry for object 0, so the table is only right
  # when objects were added in ascending ID order starting at 1. The
  # trailer hard-codes object 1 as /Root and object 2 as /Info.
  def to_s()
    out = ''
    pos = 0
    offsets = []

    # Append a chunk and advance the running offset by its length.
    # to_binary is not defined in this file; presumably it makes length
    # count bytes rather than characters -- confirm.
    emit = lambda do |chunk|
      chunk.to_binary
      out << chunk
      pos += chunk.length
    end

    emit.call( "%PDF-1.5\n" )
    @objs.each do |xobj|
      offsets.push( pos )
      emit.call( "#{xobj.getID} 0 obj\n" )
      emit.call( "#{xobj}\n" )
    end

    xrefstart = pos
    entries = offsets.length + 1

    out << "xref\n" << "0 #{entries}\n" << "0000000000 65535 f \n"
    offsets.each { |off| out << format("%010d 00000 n \n", off) }
    out << "\n" << "trailer\n"
    out << "<< /Size #{entries} /Root 1 0 R /Info 2 0 R >>\n"
    out << "startxref\n" << "#{xrefstart}\n" << "%%EOF"
    out
  end
end
|
@@ -0,0 +1,533 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: UTF-8
|
3
|
+
|
4
|
+
######################################################################
|
5
|
+
#
|
6
|
+
# PDFBeads -- convert scanned images to a single PDF file
|
7
|
+
# Version 1.0
|
8
|
+
#
|
9
|
+
# Unlike other PDF creation tools, this utility attempts to implement
|
10
|
+
# the approach typically used for DjVu books. Its key feature is
|
11
|
+
# separating scanned text (typically black, but indexed images with
|
12
|
+
# a small number of colors are also accepted) from halftone images
|
13
|
+
# placed into a background layer.
|
14
|
+
#
|
15
|
+
# Copyright (C) 2010 Alexey Kryukov (amkryukov@gmail.com).
|
16
|
+
# All rights reserved.
|
17
|
+
#
|
18
|
+
# This program is free software; you can redistribute it and/or modify
|
19
|
+
# it under the terms of the GNU General Public License as published by
|
20
|
+
# the Free Software Foundation; either version 2 of the License, or
|
21
|
+
# (at your option) any later version.
|
22
|
+
#
|
23
|
+
# This program is distributed in the hope that it will be useful,
|
24
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
25
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
26
|
+
# GNU General Public License for more details.
|
27
|
+
#
|
28
|
+
# You should have received a copy of the GNU General Public License
|
29
|
+
# along with this program; if not, write to the Free Software
|
30
|
+
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
31
|
+
#
|
32
|
+
#######################################################################
|
33
|
+
|
34
|
+
# This class provides auxiliary data (such as basic font properties or
|
35
|
+
# a width and a PostScript name for an arbitrary Unicode codepoint)
|
36
|
+
# for building a PDF font object with an arbitrary set of supported
|
37
|
+
# characters. Note that the returned properties (both of the entire font
|
38
|
+
# and individual characters) are hardcoded and correspond to those of
|
39
|
+
# Times New Roman. The reason is that we need this font just for a hidden
|
40
|
+
# text layer, so visual appearance doesn't matter.
|
41
|
+
#
|
42
|
+
class PDFBeads::PDFBuilder::FontDataProvider
|
43
|
+
# Access a hardcoded set of standard font properties (Ascent, Descent, etc.)
|
44
|
+
attr_reader :header
|
45
|
+
|
46
|
+
# Set up the hardcoded font description (@header) and the table mapping
# a Unicode codepoint to [PostScript glyph name, advance width] (@chardata).
# Codepoints missing from the table get a generated "/uniXXXX" name and
# a width of 500; such entries are added to the table on first access.
def initialize()
  @header = {
    'Ascent'      => 694,
    'XHeight'     => 447,
    'CapHeight'   => 662,
    'Descent'     => -213,
    'Flags'       => 34,
    'FontBBox'    => '[ -79 -216 1009 913 ]',
    'ItalicAngle' => 0,
    'StemV'       => 81
  }

  @chardata = {
    -1     => ["/.notdef", 250],
    0x0020 => ["/space", 250],
    0x0021 => ["/exclam", 333],
    0x0022 => ["/quotedbl", 408],
    0x0023 => ["/numbersign", 500],
    0x0024 => ["/dollar", 500],
    0x0025 => ["/percent", 833],
    0x0026 => ["/ampersand", 778],
    0x0027 => ["/quotesingle", 180],
    0x0028 => ["/parenleft", 333],
    0x0029 => ["/parenright", 333],
    0x002A => ["/asterisk", 500],
    0x002B => ["/plus", 564],
    0x002C => ["/comma", 250],
    0x002D => ["/hyphen", 333],
    0x002E => ["/period", 250],
    0x002F => ["/slash", 278],
    0x0030 => ["/zero", 500],
    0x0031 => ["/one", 500],
    0x0032 => ["/two", 500],
    0x0033 => ["/three", 500],
    0x0034 => ["/four", 500],
    0x0035 => ["/five", 500],
    0x0036 => ["/six", 500],
    0x0037 => ["/seven", 500],
    0x0038 => ["/eight", 500],
    0x0039 => ["/nine", 500],
    0x003A => ["/colon", 278],
    0x003B => ["/semicolon", 278],
    0x003C => ["/less", 564],
    0x003D => ["/equal", 564],
    0x003E => ["/greater", 564],
    0x003F => ["/question", 444],
    0x0040 => ["/at", 921],
    0x0041 => ["/A", 722],
    0x0042 => ["/B", 667],
    0x0043 => ["/C", 667],
    0x0044 => ["/D", 722],
    0x0045 => ["/E", 611],
    0x0046 => ["/F", 556],
    0x0047 => ["/G", 722],
    0x0048 => ["/H", 722],
    0x0049 => ["/I", 333],
    0x004A => ["/J", 389],
    0x004B => ["/K", 722],
    0x004C => ["/L", 611],
    0x004D => ["/M", 889],
    0x004E => ["/N", 722],
    0x004F => ["/O", 722],
    0x0050 => ["/P", 556],
    0x0051 => ["/Q", 722],
    0x0052 => ["/R", 667],
    0x0053 => ["/S", 556],
    0x0054 => ["/T", 611],
    0x0055 => ["/U", 722],
    0x0056 => ["/V", 722],
    0x0057 => ["/W", 944],
    0x0058 => ["/X", 722],
    0x0059 => ["/Y", 722],
    0x005A => ["/Z", 611],
    0x005B => ["/bracketleft", 333],
    0x005C => ["/backslash", 278],
    0x005D => ["/bracketright", 333],
    0x005E => ["/asciicircum", 469],
    0x005F => ["/underscore", 500],
    0x0060 => ["/grave", 333],
    0x0061 => ["/a", 444],
    0x0062 => ["/b", 500],
    0x0063 => ["/c", 444],
    0x0064 => ["/d", 500],
    0x0065 => ["/e", 444],
    0x0066 => ["/f", 333],
    0x0067 => ["/g", 500],
    0x0068 => ["/h", 500],
    0x0069 => ["/i", 278],
    0x006A => ["/j", 278],
    0x006B => ["/k", 500],
    0x006C => ["/l", 278],
    0x006D => ["/m", 778],
    0x006E => ["/n", 500],
    0x006F => ["/o", 500],
    0x0070 => ["/p", 500],
    0x0071 => ["/q", 500],
    0x0072 => ["/r", 333],
    0x0073 => ["/s", 389],
    0x0074 => ["/t", 278],
    0x0075 => ["/u", 500],
    0x0076 => ["/v", 500],
    0x0077 => ["/w", 722],
    0x0078 => ["/x", 500],
    0x0079 => ["/y", 500],
    0x007A => ["/z", 444],
    0x007B => ["/braceleft", 480],
    0x007C => ["/bar", 200],
    0x007D => ["/braceright", 480],
    0x007E => ["/asciitilde", 541],
    0x00A1 => ["/exclamdown", 333],
    0x00A2 => ["/cent", 500],
    0x00A3 => ["/sterling", 500],
    0x00A4 => ["/currency", 500],
    0x00A5 => ["/yen", 500],
    0x00A6 => ["/brokenbar", 200],
    0x00A7 => ["/section", 500],
    0x00A8 => ["/dieresis", 333],
    0x00A9 => ["/copyright", 760],
    0x00AA => ["/ordfeminine", 276],
    0x00AB => ["/guillemotleft", 500],
    0x00AC => ["/logicalnot", 564],
    0x00AD => ["/softhyphen", 333],
    0x00AE => ["/registered", 760],
    0x00AF => ["/macron", 333],
    0x00B0 => ["/degree", 400],
    0x00B1 => ["/plusminus", 564],
    0x00B2 => ["/twosuperior", 300],
    0x00B3 => ["/threesuperior", 300],
    0x00B4 => ["/acute", 333],
    0x00B5 => ["/mu", 500],
    0x00B6 => ["/paragraph", 453],
    0x00B7 => ["/periodcentered", 250],
    0x00B8 => ["/cedilla", 333],
    0x00B9 => ["/onesuperior", 300],
    0x00BA => ["/ordmasculine", 310],
    0x00BB => ["/guillemotright", 500],
    0x00BC => ["/onequarter", 750],
    0x00BD => ["/onehalf", 750],
    0x00BE => ["/threequarters", 750],
    0x00BF => ["/questiondown", 444],
    0x00C0 => ["/Agrave", 722],
    0x00C1 => ["/Aacute", 722],
    0x00C2 => ["/Acircumflex", 722],
    0x00C3 => ["/Atilde", 722],
    0x00C4 => ["/Adieresis", 722],
    0x00C5 => ["/Aring", 722],
    0x00C6 => ["/AE", 889],
    0x00C7 => ["/Ccedilla", 667],
    0x00C8 => ["/Egrave", 611],
    0x00C9 => ["/Eacute", 611],
    0x00CA => ["/Ecircumflex", 611],
    0x00CB => ["/Edieresis", 611],
    0x00CC => ["/Igrave", 333],
    0x00CD => ["/Iacute", 333],
    0x00CE => ["/Icircumflex", 333],
    0x00CF => ["/Idieresis", 333],
    0x00D0 => ["/Eth", 722],
    0x00D1 => ["/Ntilde", 722],
    0x00D2 => ["/Ograve", 722],
    0x00D3 => ["/Oacute", 722],
    0x00D4 => ["/Ocircumflex", 722],
    0x00D5 => ["/Otilde", 722],
    0x00D6 => ["/Odieresis", 722],
    0x00D7 => ["/multiply", 564],
    0x00D8 => ["/Oslash", 722],
    0x00D9 => ["/Ugrave", 722],
    0x00DA => ["/Uacute", 722],
    0x00DB => ["/Ucircumflex", 722],
    0x00DC => ["/Udieresis", 722],
    0x00DD => ["/Yacute", 722],
    0x00DE => ["/Thorn", 556],
    0x00DF => ["/germandbls", 500],
    0x00E0 => ["/agrave", 444],
    0x00E1 => ["/aacute", 444],
    0x00E2 => ["/acircumflex", 444],
    0x00E3 => ["/atilde", 444],
    0x00E4 => ["/adieresis", 444],
    0x00E5 => ["/aring", 444],
    0x00E6 => ["/ae", 667],
    0x00E7 => ["/ccedilla", 444],
    0x00E8 => ["/egrave", 444],
    0x00E9 => ["/eacute", 444],
    0x00EA => ["/ecircumflex", 444],
    0x00EB => ["/edieresis", 444],
    0x00EC => ["/igrave", 278],
    0x00ED => ["/iacute", 278],
    0x00EE => ["/icircumflex", 278],
    0x00EF => ["/idieresis", 278],
    0x00F0 => ["/eth", 500],
    0x00F1 => ["/ntilde", 500],
    0x00F2 => ["/ograve", 500],
    0x00F3 => ["/oacute", 500],
    0x00F4 => ["/ocircumflex", 500],
    0x00F5 => ["/otilde", 500],
    0x00F6 => ["/odieresis", 500],
    0x00F7 => ["/divide", 564],
    0x00F8 => ["/oslash", 500],
    0x00F9 => ["/ugrave", 500],
    0x00FA => ["/uacute", 500],
    0x00FB => ["/ucircumflex", 500],
    0x00FC => ["/udieresis", 500],
    0x00FD => ["/yacute", 500],
    0x00FE => ["/thorn", 500],
    0x00FF => ["/ydieresis", 500],
    0x0131 => ["/dotlessi", 278],
    0x0141 => ["/Lslash", 611],
    0x0142 => ["/lslash", 278],
    0x0152 => ["/OE", 889],
    0x0153 => ["/oe", 722],
    0x0160 => ["/Scaron", 556],
    0x0161 => ["/scaron", 389],
    0x0178 => ["/Ydieresis", 722],
    0x017D => ["/Zcaron", 611],
    0x017E => ["/zcaron", 444],
    0x0192 => ["/florin", 488],
    0x02C6 => ["/circumflex", 333],
    0x02C7 => ["/caron", 333],
    0x02D8 => ["/breve", 333],
    0x02D9 => ["/dotaccent", 333],
    0x02DA => ["/ring", 333],
    0x02DB => ["/ogonek", 333],
    0x02DC => ["/tilde", 333],
    0x02DD => ["/hungarumlaut", 333],
    0x0394 => ["/Delta", 643],
    0x0401 => ["/afii10023", 611],
    0x0402 => ["/afii10051", 752],
    0x0403 => ["/afii10052", 578],
    0x0404 => ["/afii10053", 660],
    0x0405 => ["/afii10054", 556],
    0x0406 => ["/afii10055", 333],
    0x0407 => ["/afii10056", 333],
    0x0408 => ["/afii10057", 389],
    0x0409 => ["/afii10058", 872],
    0x040A => ["/afii10059", 872],
    0x040B => ["/afii10060", 741],
    0x040C => ["/afii10061", 667],
    0x040E => ["/afii10062", 708],
    0x040F => ["/afii10145", 722],
    0x0410 => ["/afii10017", 722],
    0x0411 => ["/afii10018", 574],
    0x0412 => ["/afii10019", 667],
    0x0413 => ["/afii10020", 578],
    0x0414 => ["/afii10021", 682],
    0x0415 => ["/afii10022", 611],
    0x0416 => ["/afii10024", 896],
    0x0417 => ["/afii10025", 501],
    0x0418 => ["/afii10026", 722],
    0x0419 => ["/afii10027", 722],
    0x041A => ["/afii10028", 667],
    0x041B => ["/afii10029", 678],
    0x041C => ["/afii10030", 889],
    0x041D => ["/afii10031", 722],
    0x041E => ["/afii10032", 722],
    0x041F => ["/afii10033", 722],
    0x0420 => ["/afii10034", 556],
    0x0421 => ["/afii10035", 667],
    0x0422 => ["/afii10036", 611],
    0x0423 => ["/afii10037", 708],
    0x0424 => ["/afii10038", 790],
    0x0425 => ["/afii10039", 722],
    0x0426 => ["/afii10040", 722],
    0x0427 => ["/afii10041", 650],
    0x0428 => ["/afii10042", 1009],
    0x0429 => ["/afii10043", 1009],
    0x042A => ["/afii10044", 706],
    0x042B => ["/afii10045", 872],
    0x042C => ["/afii10046", 574],
    0x042D => ["/afii10047", 660],
    0x042E => ["/afii10048", 1028],
    0x042F => ["/afii10049", 667],
    0x0430 => ["/afii10065", 444],
    0x0431 => ["/afii10066", 509],
    0x0432 => ["/afii10067", 472],
    0x0433 => ["/afii10068", 410],
    0x0434 => ["/afii10069", 509],
    0x0435 => ["/afii10070", 444],
    0x0436 => ["/afii10072", 691],
    0x0437 => ["/afii10073", 395],
    0x0438 => ["/afii10074", 535],
    0x0439 => ["/afii10075", 535],
    0x043A => ["/afii10076", 486],
    0x043B => ["/afii10077", 499],
    0x043C => ["/afii10078", 633],
    0x043D => ["/afii10079", 535],
    0x043E => ["/afii10080", 500],
    0x043F => ["/afii10081", 535],
    0x0440 => ["/afii10082", 500],
    0x0441 => ["/afii10083", 444],
    0x0442 => ["/afii10084", 437],
    0x0443 => ["/afii10085", 500],
    0x0444 => ["/afii10086", 648],
    0x0445 => ["/afii10087", 500],
    0x0446 => ["/afii10088", 535],
    0x0447 => ["/afii10089", 503],
    0x0448 => ["/afii10090", 770],
    0x0449 => ["/afii10091", 770],
    0x044A => ["/afii10092", 517],
    0x044B => ["/afii10093", 672],
    0x044C => ["/afii10094", 456],
    0x044D => ["/afii10095", 429],
    0x044E => ["/afii10096", 747],
    0x044F => ["/afii10097", 460],
    0x0451 => ["/afii10071", 444],
    0x0452 => ["/afii10099", 483],
    0x0453 => ["/afii10100", 410],
    0x0454 => ["/afii10101", 429],
    0x0455 => ["/afii10102", 389],
    0x0456 => ["/afii10103", 278],
    0x0457 => ["/afii10104", 278],
    0x0458 => ["/afii10105", 278],
    0x0459 => ["/afii10106", 727],
    0x045A => ["/afii10107", 723],
    0x045B => ["/afii10108", 500],
    0x045C => ["/afii10109", 486],
    0x045E => ["/afii10110", 500],
    0x045F => ["/afii10193", 535],
    0x0462 => ["/afii10146", 648],
    0x0463 => ["/afii10194", 514],
    0x0472 => ["/afii10147", 722],
    0x0473 => ["/afii10195", 500],
    0x0474 => ["/afii10148", 771],
    0x0475 => ["/afii10196", 536],
    0x0490 => ["/afii10050", 450],
    0x0491 => ["/afii10098", 351],
    0x2013 => ["/endash", 500],
    0x2014 => ["/emdash", 1000],
    0x2018 => ["/quoteleft", 333],
    0x2019 => ["/quoteright", 333],
    0x201A => ["/quotesinglbase", 333],
    0x201C => ["/quotedblleft", 444],
    0x201D => ["/quotedblright", 444],
    0x201E => ["/quotedblbase", 444],
    0x2020 => ["/dagger", 500],
    0x2021 => ["/daggerdbl", 500],
    0x2022 => ["/bullet", 350],
    0x2026 => ["/ellipsis", 1000],
    0x2030 => ["/perthousand", 1000],
    0x2039 => ["/guilsinglleft", 333],
    0x203A => ["/guilsinglright", 333],
    0x20AC => ["/Euro", 500],
    0x2116 => ["/afii61352", 954],
    0x2122 => ["/trademark", 980],
    0x2202 => ["/partialdiff", 490],
    0x2212 => ["/minus", 564],
    0x221A => ["/radical", 552],
    0x221E => ["/infinity", 708],
    0x2248 => ["/approxequal", 564],
    0x2260 => ["/notequal", 564],
    0x2264 => ["/lessequal", 564],
    0x2265 => ["/greaterequal", 564],
    0xFB01 => ["/fi", 556],
    0xFB02 => ["/fl", 556],
  }

  # This has to be default_proc=, not default=: assigning a Proc with
  # default= merely makes the Proc object itself the value returned for
  # missing keys (it is never called with the hash and the key), so any
  # codepoint outside the table above used to blow up in the callers.
  @chardata.default_proc = proc do |fd, uni|
    fd[uni] = [ sprintf( "/uni%04X",uni ), 500 ]
  end

  @encodings = Array.new()
  @wlists = Array.new()
end
|
406
|
+
|
407
|
+
# Return the width of a given UTF-8 string formatted with our hardcoded
|
408
|
+
# font at a given point size
|
409
|
+
def getLineWidth( line,size )
  # Sum the per-character advances; table widths are given per 1000
  # units of the point size.
  total = line.each_char.inject( 0.0 ) do |sum, uc|
    advance =
      begin
        @chardata[uc.ord][1] * size / 1000.0
      rescue
        # The lookup failed (e.g. uc.ord raised on a malformed sequence):
        # report the offending bytes in hex and charge the width of a
        # question mark for each of them instead.
        octets = uc.unpack( 'C*' )
        hex = octets.map{ |b| sprintf( "%02x",b ) }.join
        $stderr.puts( "Warning: an invalid UTF-8 sequence (#{hex}) in the hOCR data." )
        ( @chardata[0x003F][1] * size / 1000.0 ) * octets.length
      end
    sum + advance
  end
  total.to_f
end
|
424
|
+
|
425
|
+
# Take an array of UTF-8 characters and return an array of the
|
426
|
+
# corresponding PostScript glyph names
|
427
|
+
def getEncoding( enc )
  # Element 0 of each table entry is the glyph name; the result keeps
  # the order of the input.
  enc.map { |char| @chardata[char.ord].first }
end
|
434
|
+
|
435
|
+
# Take an array of UTF-8 characters and return an array of the
|
436
|
+
# corresponding glyph widths
|
437
|
+
def getWidths( enc )
  # Element 1 of each table entry is the advance width; the result keeps
  # the order of the input.
  enc.map { |char| @chardata[char.ord].last }
end
|
444
|
+
|
445
|
+
# Take an array of UTF-8 characters and return the corresponding
|
446
|
+
# ToUnicode cmap object
|
447
|
+
def getCMAP( enc )
  # enc: the position of an element is the one-byte character code, and
  # the element's #ord is the Unicode codepoint that code stands for.
  # A negative #ord (such as the -1 used as the key of /.notdef in the
  # glyph table) is treated as "no mapping" and produces no CMap entry.
  #
  # Returns a PDFBuilder::XObj whose stream is the Flate-compressed CMap.
  cmap = [
    "/CIDInit /ProcSet findresource begin\n",
    "12 dict begin\n",
    "begincmap\n",
    "/CIDSystemInfo\n",
    "<<\n",
    "  /Registry ( PDFBeads )\n",
    "  /Ordering ( Custom )\n",
    "  /Supplement 0\n",
    ">> def\n",
    "/CMapName /PDFBeads-Custom def\n",
    "/CMapType 2 def\n",
    "1 begincodespacerange\n",
    "<00> <FF>\n",
    "endcodespacerange\n",
  ].join( '' )

  # Destination strings are UTF-16BE: four hex digits for a BMP
  # codepoint, a surrogate pair (eight digits) for anything above U+FFFF.
  utf16hex = lambda do |cp|
    if cp > 0xFFFF
      v = cp - 0x10000
      sprintf( "%04X%04X", 0xD800 + ( v >> 10 ), 0xDC00 + ( v & 0x3FF ) )
    else
      sprintf( "%04X", cp )
    end
  end

  # Group the codes into runs [first_code, last_code, first_codepoint]
  # of adjacent codes mapped to consecutive codepoints. A run is cut
  # where the low byte of the codepoint wraps from FF to 00, because a
  # bfrange only increments the last byte of its destination string.
  runs = []
  enc.each_with_index do |char, code|
    cp = char.ord
    next if cp < 0

    run = runs.last
    if run && run[1] == code - 1 && cp == run[2] + ( code - run[0] ) && ( cp & 0xFF ) != 0
      run[1] = code
    else
      runs << [ code, code, cp ]
    end
  end

  # Runs of two or more codes become bfrange entries, the rest bfchar.
  ranges, singles = runs.partition { |run| run[1] > run[0] }

  unless ranges.empty?
    cmap << "#{ranges.length} beginbfrange\n"
    ranges.each do |first, last, cp|
      cmap << sprintf( "<%02X> <%02X> <%s>\n", first, last, utf16hex.call( cp ) )
    end
    cmap << "endbfrange\n"
  end

  unless singles.empty?
    cmap << "#{singles.length} beginbfchar\n"
    singles.each do |code, _last, cp|
      cmap << sprintf( "<%02X> <%s>\n", code, utf16hex.call( cp ) )
    end
    cmap << "endbfchar\n"
  end

  cmap << "endcmap\n"
  cmap << "CMapName currentdict /CMap defineresource pop\n"
  cmap << "end\n"
  cmap << "end\n"

  PDFBuilder::XObj.new( Hash[
    'Filter' => '/FlateDecode',
  ], Zlib::Deflate.deflate( cmap,9 ) )
end
|
533
|
+
end
|