rpdf2txt 0.8.2
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +5 -0
- data/LICENCE +515 -0
- data/Manifest.txt +126 -0
- data/README.txt +30 -0
- data/Rakefile +24 -0
- data/bin/rpdf2txt +58 -0
- data/config.save +12 -0
- data/install.rb +1098 -0
- data/lib/rpdf2txt-rockit/base_extensions.rb +73 -0
- data/lib/rpdf2txt-rockit/bootstrap.rb +120 -0
- data/lib/rpdf2txt-rockit/bounded_lru_cache.rb +43 -0
- data/lib/rpdf2txt-rockit/conflict_resolution.rb +302 -0
- data/lib/rpdf2txt-rockit/directed_graph.rb +401 -0
- data/lib/rpdf2txt-rockit/glr_parser.rb +393 -0
- data/lib/rpdf2txt-rockit/grammar.rb +644 -0
- data/lib/rpdf2txt-rockit/graphdrawing.rb +107 -0
- data/lib/rpdf2txt-rockit/graphviz_dot.rb +63 -0
- data/lib/rpdf2txt-rockit/indexable.rb +53 -0
- data/lib/rpdf2txt-rockit/lalr_parsetable_generator.rb +144 -0
- data/lib/rpdf2txt-rockit/parse_table.rb +273 -0
- data/lib/rpdf2txt-rockit/parsetable_generation.rb +164 -0
- data/lib/rpdf2txt-rockit/parsing_ambiguities.rb +84 -0
- data/lib/rpdf2txt-rockit/profiler.rb +168 -0
- data/lib/rpdf2txt-rockit/reduce_actions_generator.rb +523 -0
- data/lib/rpdf2txt-rockit/rockit.rb +76 -0
- data/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb +187 -0
- data/lib/rpdf2txt-rockit/rockit_grammars_parser.rb +126 -0
- data/lib/rpdf2txt-rockit/sourcecode_dumpable.rb +181 -0
- data/lib/rpdf2txt-rockit/stringscanner.rb +54 -0
- data/lib/rpdf2txt-rockit/syntax_tree.rb +452 -0
- data/lib/rpdf2txt-rockit/token.rb +364 -0
- data/lib/rpdf2txt-rockit/version.rb +3 -0
- data/lib/rpdf2txt/attributesparser.rb +42 -0
- data/lib/rpdf2txt/cmapparser.rb +65 -0
- data/lib/rpdf2txt/data/_cmap.grammar +11 -0
- data/lib/rpdf2txt/data/_cmap_range.grammar +15 -0
- data/lib/rpdf2txt/data/_pdfattributes.grammar +32 -0
- data/lib/rpdf2txt/data/cmap.grammar +11 -0
- data/lib/rpdf2txt/data/cmap.rb +37 -0
- data/lib/rpdf2txt/data/cmap_range.grammar +15 -0
- data/lib/rpdf2txt/data/cmap_range.rb +43 -0
- data/lib/rpdf2txt/data/fonts/Courier-Bold.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Courier-BoldOblique.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Courier-Oblique.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Courier.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Helvetica-Bold.afm +2827 -0
- data/lib/rpdf2txt/data/fonts/Helvetica-BoldOblique.afm +2827 -0
- data/lib/rpdf2txt/data/fonts/Helvetica-Oblique.afm +3051 -0
- data/lib/rpdf2txt/data/fonts/Helvetica.afm +3051 -0
- data/lib/rpdf2txt/data/fonts/License-Adobe.txt +65 -0
- data/lib/rpdf2txt/data/fonts/Symbol.afm +213 -0
- data/lib/rpdf2txt/data/fonts/Times-Bold.afm +2588 -0
- data/lib/rpdf2txt/data/fonts/Times-BoldItalic.afm +2384 -0
- data/lib/rpdf2txt/data/fonts/Times-Italic.afm +2667 -0
- data/lib/rpdf2txt/data/fonts/Times-Roman.afm +2419 -0
- data/lib/rpdf2txt/data/fonts/ZapfDingbats.afm +225 -0
- data/lib/rpdf2txt/data/pdfattributes.grammar +32 -0
- data/lib/rpdf2txt/data/pdfattributes.rb +71 -0
- data/lib/rpdf2txt/data/pdftext.grammar +102 -0
- data/lib/rpdf2txt/data/pdftext.rb +146 -0
- data/lib/rpdf2txt/default_handler.rb +352 -0
- data/lib/rpdf2txt/lzw.rb +69 -0
- data/lib/rpdf2txt/object.rb +1114 -0
- data/lib/rpdf2txt/parser.rb +169 -0
- data/lib/rpdf2txt/symbol.rb +408 -0
- data/lib/rpdf2txt/text.rb +182 -0
- data/lib/rpdf2txt/text_state.rb +434 -0
- data/lib/rpdf2txt/textparser.rb +42 -0
- data/test/data/3392_obj +0 -0
- data/test/data/397_decrypted +15 -0
- data/test/data/450_decrypted +153 -0
- data/test/data/450_obj +0 -0
- data/test/data/452_decrypted +125 -0
- data/test/data/454_decrypted +108 -0
- data/test/data/456_decrypted +106 -0
- data/test/data/458_decrypted +111 -0
- data/test/data/458_obj +0 -0
- data/test/data/460_decrypted +118 -0
- data/test/data/460_obj +0 -0
- data/test/data/463_decrypted +117 -0
- data/test/data/465_decrypted +107 -0
- data/test/data/465_obj +0 -0
- data/test/data/90_obj +0 -0
- data/test/data/90_obj_comp +1 -0
- data/test/data/decrypted +0 -0
- data/test/data/encrypt_obj +0 -0
- data/test/data/encrypt_string +0 -0
- data/test/data/encrypt_string_128bit +0 -0
- data/test/data/encrypted_object_stream.pdf +0 -0
- data/test/data/firststream +1 -0
- data/test/data/index.pdfobj +0 -0
- data/test/data/index_2bit.pdfobj +0 -0
- data/test/data/index_masked.pdfobj +0 -0
- data/test/data/indexed.pdfobj +0 -0
- data/test/data/indexed_2bit.pdfobj +0 -0
- data/test/data/indexed_masked.pdfobj +0 -0
- data/test/data/inline.png +0 -0
- data/test/data/logo.png +0 -0
- data/test/data/lzw.pdfobj +0 -0
- data/test/data/lzw_index.pdfobj +0 -0
- data/test/data/page_tree.pdf +148 -0
- data/test/data/pdf_20.png +0 -0
- data/test/data/pdf_21.png +0 -0
- data/test/data/pdf_22.png +0 -0
- data/test/data/pdf_50.png +0 -0
- data/test/data/png.pdfobj +0 -0
- data/test/data/space_bug_stream.txt +119 -0
- data/test/data/stream.txt +292 -0
- data/test/data/stream_kerning_bug.txt +13 -0
- data/test/data/stream_kerning_bug2.txt +6 -0
- data/test/data/test.pdf +0 -0
- data/test/data/test.txt +8 -0
- data/test/data/test_text.txt +42 -0
- data/test/data/working_obj +0 -0
- data/test/data/working_obj2 +0 -0
- data/test/mock.rb +149 -0
- data/test/suite.rb +30 -0
- data/test/test_pdf_object.rb +1802 -0
- data/test/test_pdf_parser.rb +1340 -0
- data/test/test_pdf_text.rb +789 -0
- data/test/test_space_bug_05_2004.rb +87 -0
- data/test/test_stream.rb +194 -0
- data/test/test_text_state.rb +315 -0
- data/usage-en.txt +112 -0
- data/user-stories/UserStories_Rpdf2Txt.txt +34 -0
- data/user-stories/documents/swissmedicjournal/04_2004.pdf +0 -0
- metadata +220 -0
@@ -0,0 +1,169 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# Rpdf2txt -- PDF to Text Parser
|
4
|
+
# Copyright (C) 2003 Andreas Schrafl, Hannes Wyss
|
5
|
+
#
|
6
|
+
# This library is free software; you can redistribute it and/or
|
7
|
+
# modify it under the terms of the GNU Lesser General Public
|
8
|
+
# License as published by the Free Software Foundation; either
|
9
|
+
# version 2.1 of the License, or (at your option) any later version.
|
10
|
+
#
|
11
|
+
# This library is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
14
|
+
# Lesser General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU Lesser General Public
|
17
|
+
# License along with this library; if not, write to the Free Software
|
18
|
+
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
19
|
+
#
|
20
|
+
# ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
|
21
|
+
# hwyss@ywesee.com, aschrafl@ywesee.com
|
22
|
+
#
|
23
|
+
# PdfParser -- Rpdf2txt-- 14.11.2002 -- aschrafl@ywesee.com
|
24
|
+
|
25
|
+
require 'zlib'
|
26
|
+
require 'rpdf2txt/object'
|
27
|
+
require 'rpdf2txt/default_handler'
|
28
|
+
require 'md5'
|
29
|
+
|
30
|
+
module Rpdf2txt
|
31
|
+
VERSION = '0.8.2'
|
32
|
+
class Parser
|
33
|
+
attr_accessor :encrypt
|
34
|
+
# Prepares a parser for the raw text of a PDF document.
#
# pdf_stream::      the complete PDF source; the other methods of this class
#                   search it with String#index and String#scan.
# target_encoding:: passed on unchanged to every object this parser builds.
#
# Nothing is parsed here; the object catalogue starts out empty (nil) and is
# built on first use.
def initialize(pdf_stream, target_encoding='utf8')
  @src = pdf_stream
  @target_encoding = target_encoding
  @object_catalogue = nil
  @encrypt_id = nil
end
|
40
|
+
# Feeds the document's text into +callback_handler+.
#
# For every entry yielded by page_tree the entry's #text is called with the
# handler, followed by the handler's #send_page. Once all entries are done
# the handler's #send_eof is called; its result is the return value.
def extract_text(callback_handler = SimpleHandler.new)
  page_tree.each do |entry|
    entry.text(callback_handler)
    callback_handler.send_page
  end
  callback_handler.send_eof
end
|
47
|
+
# Memoised catalogue of the objects found in the PDF source.
# The first call delegates to build_object_catalogue and caches the result
# in @object_catalogue; every later call returns that same object.
def object_catalogue
  @object_catalogue = build_object_catalogue unless @object_catalogue
  @object_catalogue
end
|
50
|
+
# Memoised page tree: built once through build_page_tree, then served from
# @page_tree on every later call.
def page_tree
  @page_tree ||= build_page_tree
end
|
53
|
+
# helper methods
|
54
|
+
# Locates the document's trailer information, caches it in
# @trailer_dictionary and returns it.
#
# Sources, tried in this order:
# 1. an object already in the catalogue that is a TrailerDictionary
#    (build_object creates one for "/Type /XRef" objects);
# 2. every "trailer ... startxref" section of the source: the first section
#    becomes the dictionary unless step 1 supplied one, each further
#    section is merged in through #update;
# 3. only if nothing was found so far: the text that starts at the offset
#    given by the first "startxref <n> %%EOF" marker and runs up to and
#    including the next "endobj".
#
# If the trailer names an encryption object and that object is present in
# the catalogue, a PdfEncrypt is built from its #src, gets the trailer's
# file id, is stored in @encrypt and is assigned as #decoder to every
# catalogued object.
#
# NOTE(review): when none of the three sources yields a trailer, or step 3
# finds no "endobj", this still ends in a NoMethodError on nil, exactly as
# before.
def build_trailer_dictionary
  # Use the memoising reader instead of the bare instance variable: this
  # method and trailer_dictionary are public and may run before anything
  # has populated @object_catalogue (initialize leaves it nil).
  catalogue = object_catalogue
  @trailer_dictionary = catalogue.values.find do |candidate|
    candidate.is_a?(TrailerDictionary)
  end

  section_start = 0
  section_end = 0
  while section_end && (section_start = @src.index(/\btrailer/n, section_end))
    # A "trailer" without a following "startxref" ends the search.
    break unless (section_end = @src.index(/startxref/n, section_start))
    # index points at the "s" of "startxref"; + 8 moves to its last letter
    # so that the inclusive range below keeps the whole keyword.
    section_end += 8
    section = TrailerDictionary.new(@src[section_start..section_end],
                                    @target_encoding)
    if @trailer_dictionary.nil?
      @trailer_dictionary = section
    else
      @trailer_dictionary.update(section)
    end
  end

  if @trailer_dictionary.nil? &&
     (match = /startxref\s*(\d+)\s*%%EOF/m.match(@src))
    xref_start = match[1].to_i
    # + 6 == "endobj".length; the range below excludes its end.
    xref_end = @src.index(/endobj/n, xref_start) + 6
    @trailer_dictionary = TrailerDictionary.new(@src[xref_start...xref_end],
                                                @target_encoding)
  end

  @encrypt_id = @trailer_dictionary.encrypt_id
  encrypt_object = @encrypt_id && catalogue[@encrypt_id]
  if encrypt_object
    @encrypt = PdfEncrypt.new(encrypt_object.src)
    @encrypt.file_id = @trailer_dictionary.file_id
    # Distinct block variable: the former |obj| shadowed the outer local.
    catalogue.each_value do |entry|
      entry.decoder = @encrypt
    end
  end
  @trailer_dictionary
end
|
89
|
+
# Memoised trailer dictionary: computed once by build_trailer_dictionary,
# afterwards returned from @trailer_dictionary.
def trailer_dictionary
  @trailer_dictionary ||= build_trailer_dictionary
end
|
92
|
+
private
|
93
|
+
# Wraps the source text of one PDF object in the class that matches its
# content and returns the new instance (built with +src+ and the parser's
# target encoding).
#
# The patterns are tried top to bottom and the first hit wins, so the order
# is significant: e.g. "/Pages" is tested before "/Page", and the typed
# dictionaries before the generic stream / array / hash fallbacks.
# Anything that matches no pattern becomes an Unknown.
def build_object(src)
  klass = case src
          when /\/Type\s*\/Catalog\b/n        then CatalogNode
          when /\/Type\s*\/Pages\b/n          then PageNode
          when /\/Type\s*\/Page\b/n           then PageLeaf
          when /\/Type\s*\/Font\b/n           then Font
          when /\/Type\s*\/FontDescriptor\b/n then FontDescriptor
          when /\/Type\s*\/Encoding\b/n       then Encoding
          when /\/Type\s*\/ObjStm\b/n         then ObjStream
          when /\/Type\s*\/XRef\b/n           then TrailerDictionary
          when %r!/Subtype\s*/Image!n         then Image
          when /\bstream\b/n, %r{/ToUnicode\b}n then Stream
          when /\/Font\s*<</mn                then Resource
          # "n g obj [ a b R c d R ... ] endobj": array of references only
          when /^(?:\d+\s+){2}obj\s*\[\s*(?:(\d+\s+){2}R\s*)*\]\s+endobj/mn
            ReferenceArray
          # "n g obj [ 1 2 3 ... ] endobj": array of plain integers
          when /^(?:\d+\s+){2}obj\s*\[\s*(?:(\d+\s*))*\]\s+endobj/mn
            PdfArray
          when /obj\s*<</mn                   then PdfHash
          else Unknown
          end
  klass.new(src, @target_encoding)
end
|
127
|
+
# Scans the whole PDF source for "<number> <number> obj ... endobj"
# sections, turns each into an object via build_object and returns a hash
# that maps every object's #oid to the object. When two sections report the
# same oid, the later one replaces the earlier one.
def build_object_catalogue
  object_pattern = /(?:\d+ ){2}obj\b.*?\bendobj\b/mn
  @src.scan(object_pattern).each_with_object({}) do |object_src, index|
    pdf_object = build_object(object_src)
    index.store(pdf_object.oid, pdf_object)
  end
end
|
137
|
+
# Adds the objects packed inside object streams to the catalogue.
# Every catalogued ObjStream has its #decoded_stream handed to
# scan_object_stream together with the catalogue. The list of streams is
# collected before scanning starts, so objects added during the scan are
# not revisited. Returns that list of ObjStream objects.
def rebuild_object_catalogue
  containers = object_catalogue.values.select { |entry| entry.is_a?(ObjStream) }
  containers.each do |container|
    scan_object_stream(container.decoded_stream, object_catalogue)
  end
end
|
144
|
+
# Resolves the root of the page tree first, then asks it to build the tree
# from the object catalogue. Returns whatever the root's #build_tree returns.
def build_page_tree
  root = page_tree_root
  root.build_tree(object_catalogue)
end
|
147
|
+
# Unpacks the objects stored in a decoded object stream and registers them
# in +catalogue+.
#
# src::       decoded stream text: a run of "<object id> <offset>" number
#             pairs, followed by the object bodies.
# catalogue:: hash that receives the unpacked objects, keyed by their #oid.
#
# Each body is cut out from its offset up to the next pair's offset (the
# last body runs to the end), wrapped as "<id> 0 obj <body> endobj" and
# handed to build_object. Offsets are applied relative to the text that
# follows the pair table.
#
# Returns +catalogue+. It is returned unchanged when +src+ contains no line
# that starts with such a pair table.
def scan_object_stream(src, catalogue)
  # /m lets "." cross line breaks. Without it <objects> ended at the first
  # newline, so every object behind that newline came out empty or cut off.
  match = /^(?<pairs>(\d+\s+\d+\s+?)+)(?<objects>.*)/m.match(src)
  # No pair table (e.g. empty stream): nothing to unpack.
  return catalogue unless match

  pairs, objects = match[:pairs], match[:objects]
  offsets = pairs.scan(/(\d+)\s+(\d+)/).map do |obj_id, offset|
    [obj_id.to_i, offset.to_i]
  end
  offsets.each_with_index do |(obj_id, offset), idx|
    # nil for the last pair: its body then runs to the end of the text.
    _next_id, next_offset = offsets[idx + 1]
    body = objects[offset...(next_offset || src.length)]
    obj = build_object(sprintf("%i 0 obj %s endobj", obj_id, body))
    catalogue.store(obj.oid, obj)
  end
  catalogue
end
|
162
|
+
# Looks up the object the trailer names as document root.
#
# The three steps run in a fixed order: the catalogue is obtained first,
# then the trailer dictionary, and only then are the object streams
# unpacked into the catalogue. The lookup happens last, so a root that
# lives inside an object stream can be found. Returns the catalogue entry
# stored under the trailer's #root_id (nil if there is none).
def page_tree_root
  objects = object_catalogue
  trailer_info = trailer_dictionary
  rebuild_object_catalogue
  objects[trailer_info.root_id]
end
|
168
|
+
end
|
169
|
+
end
|
@@ -0,0 +1,408 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# Symbol -- Rpdf2txt -- 27.04.2007 -- hwyss@ywesee.com
|
3
|
+
|
4
|
+
module Rpdf2txt
|
5
|
+
module Symbol
|
6
|
+
NAMES = {
|
7
|
+
"Alpha" => 0101, # Α
|
8
|
+
"Beta" => 0102, # Β
|
9
|
+
"Chi" => 0103, # Χ
|
10
|
+
"Delta" => 0104, # Δ
|
11
|
+
"Epsilon" => 0105, # Ε
|
12
|
+
"Eta" => 0110, # Η
|
13
|
+
"Euro" => 0240, # €
|
14
|
+
"Gamma" => 0107, # Γ
|
15
|
+
"Ifraktur" => 0301, # ℑ
|
16
|
+
"Iota" => 0111, # Ι
|
17
|
+
"Kappa" => 0113, # Κ
|
18
|
+
"Lambda" => 0114, # Λ
|
19
|
+
"Mu" => 0115, # Μ
|
20
|
+
"Nu" => 0116, # Ν
|
21
|
+
"Omega" => 0127, # Ω
|
22
|
+
"Omicron" => 0117, # Ο
|
23
|
+
"Phi" => 0106, # Φ
|
24
|
+
"Pi" => 0120, # Π
|
25
|
+
"Psi" => 0131, # Ψ
|
26
|
+
"Rfraktur" => 0302, # ℜ
|
27
|
+
"Rho" => 0122, # Ρ
|
28
|
+
"Sigma" => 0123, # Σ
|
29
|
+
"Tau" => 0124, # Τ
|
30
|
+
"Theta" => 0121, # Θ
|
31
|
+
"Upsilon" => 0125, # Υ
|
32
|
+
"Upsilon1" => 0241, # ϒ
|
33
|
+
"Xi" => 0130, # Ξ
|
34
|
+
"Zeta" => 0132, # Ζ
|
35
|
+
"aleph" => 0300, # ℵ
|
36
|
+
"alpha" => 0141, # α
|
37
|
+
"ampersand" => 0046, # &
|
38
|
+
"angle" => 0320, # ∠
|
39
|
+
"angleleft" => 0341, # 〈
|
40
|
+
"angleright" => 0361, # 〉
|
41
|
+
"approxequal" => 0273, # ≈
|
42
|
+
"arrowboth" => 0253, # ↔
|
43
|
+
"arrowdblboth" => 0333, # ⇔
|
44
|
+
"arrowdbldown" => 0337, # ⇓
|
45
|
+
"arrowdblleft" => 0334, # ⇐
|
46
|
+
"arrowdblright" => 0336, # ⇒
|
47
|
+
"arrowdblup" => 0335, # ⇑
|
48
|
+
"arrowdown" => 0257, # ↓
|
49
|
+
"arrowhorizex" => 0276, #
|
50
|
+
"arrowleft" => 0254, # ←
|
51
|
+
"arrowright" => 0256, # →
|
52
|
+
"arrowup" => 0255, # ↑
|
53
|
+
"arrowvertex" => 0275, #
|
54
|
+
"asteriskmath" => 0052, # ∗
|
55
|
+
"bar" => 0174, # |
|
56
|
+
"beta" => 0142, # β
|
57
|
+
"braceleft" => 0173, # {
|
58
|
+
"braceright" => 0175, # }
|
59
|
+
"bracelefttp" => 0354, #
|
60
|
+
"braceleftmid" => 0355, #
|
61
|
+
"braceleftbt" => 0356, #
|
62
|
+
"bracerighttp" => 0374, #
|
63
|
+
"bracerightmid" => 0375, #
|
64
|
+
"bracerightbt" => 0376, #
|
65
|
+
"braceex" => 0357, #
|
66
|
+
"bracketleft" => 0133, # [
|
67
|
+
"bracketright" => 0135, # ]
|
68
|
+
"bracketlefttp" => 0351, #
|
69
|
+
"bracketleftex" => 0352, #
|
70
|
+
"bracketleftbt" => 0353, #
|
71
|
+
"bracketrighttp" => 0371, #
|
72
|
+
"bracketrightex" => 0372, #
|
73
|
+
"bracketrightbt" => 0373, #
|
74
|
+
"bullet" => 0267, # •
|
75
|
+
"carriagereturn" => 0277, # ↵
|
76
|
+
"chi" => 0143, # χ
|
77
|
+
"circlemultiply" => 0304, # ⊗
|
78
|
+
"circleplus" => 0305, # ⊕
|
79
|
+
"club" => 0247, # ♣
|
80
|
+
"colon" => 0072, # :
|
81
|
+
"comma" => 0054, # ,
|
82
|
+
"congruent" => 0100, # ≅
|
83
|
+
"copyrightsans" => 0343, #
|
84
|
+
"copyrightserif" => 0323,
|
85
|
+
"degree" => 0260, # °
|
86
|
+
"delta" => 0144, # δ
|
87
|
+
"diamond" => 0250, # ♦
|
88
|
+
"divide" => 0270, # ÷
|
89
|
+
"dotmath" => 0327, # ⋅
|
90
|
+
"eight" => 0070, # 8,
|
91
|
+
"element" => 0316, # ∈
|
92
|
+
"ellipsis" => 0274, # …
|
93
|
+
"emptyset" => 0306, # ∅
|
94
|
+
"epsilon" => 0145, # ε
|
95
|
+
"equal" => 0075, # =
|
96
|
+
"equivalence" => 0272, # ≡
|
97
|
+
"eta" => 0150, # η
|
98
|
+
"exclam" => 0041, # !
|
99
|
+
"existential" => 0044, # ∃
|
100
|
+
"five" => 0065, # 5,
|
101
|
+
"florin" => 0246, # ƒ
|
102
|
+
"four" => 0064, # 4,
|
103
|
+
"fraction" => 0244, # ⁄
|
104
|
+
"gamma" => 0147, # γ
|
105
|
+
"gradient" => 0321, # ∇
|
106
|
+
"greater" => 0076, # >
|
107
|
+
"greaterequal" => 0263, # ≥
|
108
|
+
"heart" => 0251, # ♥
|
109
|
+
"infinity" => 0245, # ∞
|
110
|
+
"integral" => 0362, # ∫
|
111
|
+
"integraltp" => 0363, # ⌠
|
112
|
+
"integralex" => 0364, #
|
113
|
+
"integralbt" => 0365, # ⌡
|
114
|
+
"intersection" => 0307, # ∩
|
115
|
+
"iota" => 0151, # ι
|
116
|
+
"kappa" => 0153, # κ
|
117
|
+
"lambda" => 0154, # λ
|
118
|
+
"less" => 0074, # <
|
119
|
+
"lessequal" => 0243, # ≤
|
120
|
+
"logicaland" => 0331, # ∧
|
121
|
+
"logicalnot" => 0330, # ¬
|
122
|
+
"logicalor" => 0332, # ∨
|
123
|
+
"lozenge" => 0340, # ◊
|
124
|
+
"minus" => 0055, # −
|
125
|
+
"minute" => 0242, # ′
|
126
|
+
"mu" => 0155, # μ
|
127
|
+
"multiply" => 0264, # ×
|
128
|
+
"nine" => 0071, # 9,
|
129
|
+
"notelement" => 0317, # ∉
|
130
|
+
"notequal" => 0271, # ≠
|
131
|
+
"notsubset" => 0313, # ⊄
|
132
|
+
"nu" => 0156, # ν
|
133
|
+
"numbersign" => 0043, # #
|
134
|
+
"omega" => 0167, # ω
|
135
|
+
"omega1" => 0166, # ϖ
|
136
|
+
"omicron" => 0157, # ο
|
137
|
+
"one" => 0061, # 1,
|
138
|
+
"parenleft" => 0050, # (
|
139
|
+
"parenright" => 0051, # )
|
140
|
+
"parenlefttp" => 0346, #
|
141
|
+
"parenleftex" => 0347, #
|
142
|
+
"parenleftbt" => 0350, #
|
143
|
+
"parenrighttp" => 0366, #
|
144
|
+
"parenrightex" => 0367, #
|
145
|
+
"parenrightbt" => 0370, #
|
146
|
+
"partialdiff" => 0266, # ∂
|
147
|
+
"percent" => 0045, # %
|
148
|
+
"period" => 0056, # .
|
149
|
+
"perpendicular" => 0136, # ⊥
|
150
|
+
"phi" => 0146, # φ
|
151
|
+
"phi1" => 0152,
|
152
|
+
"pi" => 0160, # π
|
153
|
+
"plus" => 0053, # +
|
154
|
+
"plusminus" => 0261, # ±
|
155
|
+
"product" => 0325, # Π
|
156
|
+
"propersubset" => 0314, # ⊂
|
157
|
+
"propersuperset" => 0311, # ⊃
|
158
|
+
"proportional" => 0265, # ∝
|
159
|
+
"psi" => 0171, # ψ
|
160
|
+
"question" => 0077, # ?
|
161
|
+
"radical" => 0326, # √
|
162
|
+
"radicalex" => 0140,
|
163
|
+
"reflexsubset" => 0315, # ⊆
|
164
|
+
"reflexsuperset" => 0312, # ⊇
|
165
|
+
"registersans" => 0342, #
|
166
|
+
"registerserif" => 0322,
|
167
|
+
"rho" => 0162, # ρ
|
168
|
+
"second" => 0262, # ″
|
169
|
+
"semicolon" => 0073, # ;
|
170
|
+
"seven" => 0067, # 7,
|
171
|
+
"sigma" => 0163, # σ
|
172
|
+
"sigma1" => 0126, # ς
|
173
|
+
"similar" => 0176, # ∼
|
174
|
+
"six" => 0066, # 6,
|
175
|
+
"slash" => 0057, # /
|
176
|
+
"space" => 0040,
|
177
|
+
"spade" => 0252, # ♠
|
178
|
+
"suchthat" => 0047, # ∋
|
179
|
+
"summation" => 0345, # Σ
|
180
|
+
"tau" => 0164, # τ
|
181
|
+
"therefore" => 0134, # ∴
|
182
|
+
"theta" => 0161, # θ
|
183
|
+
"theta1" => 0112,
|
184
|
+
"three" => 0063, # 3,
|
185
|
+
"trademarksans" => 0344, #
|
186
|
+
"trademarkserif" => 0324,
|
187
|
+
"two" => 0062, # 2,
|
188
|
+
"underscore" => 0137, # _
|
189
|
+
"union" => 0310, # ∪
|
190
|
+
"universal" => 0042, # ∀
|
191
|
+
"upsilon" => 0165, # υ
|
192
|
+
"weierstrass" => 0303, # ℘
|
193
|
+
"xi" => 0170, # ξ
|
194
|
+
"zero" => 0060, # 0,
|
195
|
+
"zeta" => 0172, # ζ
|
196
|
+
}
|
197
|
+
UNICODE_MAP = { # based on http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/SYMBOL.TXT
|
198
|
+
0x20 => "\x00\x20", # SPACE
|
199
|
+
0x21 => "\x00\x21", # EXCLAMATION MARK
|
200
|
+
0x22 => "\x22\x00", # FOR ALL
|
201
|
+
0x23 => "\x00\x23", # NUMBER SIGN
|
202
|
+
0x24 => "\x22\x03", # THERE EXISTS
|
203
|
+
0x25 => "\x00\x25", # PERCENT SIGN
|
204
|
+
0x26 => "\x00\x26", # AMPERSAND
|
205
|
+
0x27 => "\x22\x0D", # SMALL CONTAINS AS MEMBER
|
206
|
+
0x28 => "\x00\x28", # LEFT PARENTHESIS
|
207
|
+
0x29 => "\x00\x29", # RIGHT PARENTHESIS
|
208
|
+
0x2A => "\x22\x17", # ASTERISK OPERATOR
|
209
|
+
0x2B => "\x00\x2B", # PLUS SIGN
|
210
|
+
0x2C => "\x00\x2C", # COMMA
|
211
|
+
0x2D => "\x22\x12", # MINUS SIGN
|
212
|
+
0x2E => "\x00\x2E", # FULL STOP
|
213
|
+
0x2F => "\x00\x2F", # SOLIDUS
|
214
|
+
0x30 => "\x00\x30", # DIGIT ZERO
|
215
|
+
0x31 => "\x00\x31", # DIGIT ONE
|
216
|
+
0x32 => "\x00\x32", # DIGIT TWO
|
217
|
+
0x33 => "\x00\x33", # DIGIT THREE
|
218
|
+
0x34 => "\x00\x34", # DIGIT FOUR
|
219
|
+
0x35 => "\x00\x35", # DIGIT FIVE
|
220
|
+
0x36 => "\x00\x36", # DIGIT SIX
|
221
|
+
0x37 => "\x00\x37", # DIGIT SEVEN
|
222
|
+
0x38 => "\x00\x38", # DIGIT EIGHT
|
223
|
+
0x39 => "\x00\x39", # DIGIT NINE
|
224
|
+
0x3A => "\x00\x3A", # COLON
|
225
|
+
0x3B => "\x00\x3B", # SEMICOLON
|
226
|
+
0x3C => "\x00\x3C", # LESS-THAN SIGN
|
227
|
+
0x3D => "\x00\x3D", # EQUALS SIGN
|
228
|
+
0x3E => "\x00\x3E", # GREATER-THAN SIGN
|
229
|
+
0x3F => "\x00\x3F", # QUESTION MARK
|
230
|
+
0x40 => "\x22\x45", # APPROXIMATELY EQUAL TO
|
231
|
+
0x41 => "\x03\x91", # GREEK CAPITAL LETTER ALPHA
|
232
|
+
0x42 => "\x03\x92", # GREEK CAPITAL LETTER BETA
|
233
|
+
0x43 => "\x03\xA7", # GREEK CAPITAL LETTER CHI
|
234
|
+
0x44 => "\x03\x94", # GREEK CAPITAL LETTER DELTA
|
235
|
+
0x45 => "\x03\x95", # GREEK CAPITAL LETTER EPSILON
|
236
|
+
0x46 => "\x03\xA6", # GREEK CAPITAL LETTER PHI
|
237
|
+
0x47 => "\x03\x93", # GREEK CAPITAL LETTER GAMMA
|
238
|
+
0x48 => "\x03\x97", # GREEK CAPITAL LETTER ETA
|
239
|
+
0x49 => "\x03\x99", # GREEK CAPITAL LETTER IOTA
|
240
|
+
0x4A => "\x03\xD1", # GREEK THETA SYMBOL
|
241
|
+
0x4B => "\x03\x9A", # GREEK CAPITAL LETTER KAPPA
|
242
|
+
0x4C => "\x03\x9B", # GREEK CAPITAL LETTER LAMDA
|
243
|
+
0x4D => "\x03\x9C", # GREEK CAPITAL LETTER MU
|
244
|
+
0x4E => "\x03\x9D", # GREEK CAPITAL LETTER NU
|
245
|
+
0x4F => "\x03\x9F", # GREEK CAPITAL LETTER OMICRON
|
246
|
+
0x50 => "\x03\xA0", # GREEK CAPITAL LETTER PI
|
247
|
+
0x51 => "\x03\x98", # GREEK CAPITAL LETTER THETA
|
248
|
+
0x52 => "\x03\xA1", # GREEK CAPITAL LETTER RHO
|
249
|
+
0x53 => "\x03\xA3", # GREEK CAPITAL LETTER SIGMA
|
250
|
+
0x54 => "\x03\xA4", # GREEK CAPITAL LETTER TAU
|
251
|
+
0x55 => "\x03\xA5", # GREEK CAPITAL LETTER UPSILON
|
252
|
+
0x56 => "\x03\xC2", # GREEK SMALL LETTER FINAL SIGMA
|
253
|
+
0x57 => "\x03\xA9", # GREEK CAPITAL LETTER OMEGA
|
254
|
+
0x58 => "\x03\x9E", # GREEK CAPITAL LETTER XI
|
255
|
+
0x59 => "\x03\xA8", # GREEK CAPITAL LETTER PSI
|
256
|
+
0x5A => "\x03\x96", # GREEK CAPITAL LETTER ZETA
|
257
|
+
0x5B => "\x00\x5B", # LEFT SQUARE BRACKET
|
258
|
+
0x5C => "\x22\x34", # THEREFORE
|
259
|
+
0x5D => "\x00\x5D", # RIGHT SQUARE BRACKET
|
260
|
+
0x5E => "\x22\xA5", # UP TACK
|
261
|
+
0x5F => "\x00\x5F", # LOW LINE
|
262
|
+
0x60 => "\xF8\xE5", # radical extender # corporate char
|
263
|
+
0x61 => "\x03\xB1", # GREEK SMALL LETTER ALPHA
|
264
|
+
0x62 => "\x03\xB2", # GREEK SMALL LETTER BETA
|
265
|
+
0x63 => "\x03\xC7", # GREEK SMALL LETTER CHI
|
266
|
+
0x64 => "\x03\xB4", # GREEK SMALL LETTER DELTA
|
267
|
+
0x65 => "\x03\xB5", # GREEK SMALL LETTER EPSILON
|
268
|
+
0x66 => "\x03\xC6", # GREEK SMALL LETTER PHI
|
269
|
+
0x67 => "\x03\xB3", # GREEK SMALL LETTER GAMMA
|
270
|
+
0x68 => "\x03\xB7", # GREEK SMALL LETTER ETA
|
271
|
+
0x69 => "\x03\xB9", # GREEK SMALL LETTER IOTA
|
272
|
+
0x6A => "\x03\xD5", # GREEK PHI SYMBOL
|
273
|
+
0x6B => "\x03\xBA", # GREEK SMALL LETTER KAPPA
|
274
|
+
0x6C => "\x03\xBB", # GREEK SMALL LETTER LAMDA
|
275
|
+
0x6D => "\x03\xBC", # GREEK SMALL LETTER MU
|
276
|
+
0x6E => "\x03\xBD", # GREEK SMALL LETTER NU
|
277
|
+
0x6F => "\x03\xBF", # GREEK SMALL LETTER OMICRON
|
278
|
+
0x70 => "\x03\xC0", # GREEK SMALL LETTER PI
|
279
|
+
0x71 => "\x03\xB8", # GREEK SMALL LETTER THETA
|
280
|
+
0x72 => "\x03\xC1", # GREEK SMALL LETTER RHO
|
281
|
+
0x73 => "\x03\xC3", # GREEK SMALL LETTER SIGMA
|
282
|
+
0x74 => "\x03\xC4", # GREEK SMALL LETTER TAU
|
283
|
+
0x75 => "\x03\xC5", # GREEK SMALL LETTER UPSILON
|
284
|
+
0x76 => "\x03\xD6", # GREEK PI SYMBOL
|
285
|
+
0x77 => "\x03\xC9", # GREEK SMALL LETTER OMEGA
|
286
|
+
0x78 => "\x03\xBE", # GREEK SMALL LETTER XI
|
287
|
+
0x79 => "\x03\xC8", # GREEK SMALL LETTER PSI
|
288
|
+
0x7A => "\x03\xB6", # GREEK SMALL LETTER ZETA
|
289
|
+
0x7B => "\x00\x7B", # LEFT CURLY BRACKET
|
290
|
+
0x7C => "\x00\x7C", # VERTICAL LINE
|
291
|
+
0x7D => "\x00\x7D", # RIGHT CURLY BRACKET
|
292
|
+
0x7E => "\x22\x3C", # TILDE OPERATOR
|
293
|
+
0xA0 => "\x20\xAC", # EURO SIGN
|
294
|
+
0xA1 => "\x03\xD2", # GREEK UPSILON WITH HOOK SYMBOL
|
295
|
+
0xA2 => "\x20\x32", # PRIME # minute
|
296
|
+
0xA3 => "\x22\x64", # LESS-THAN OR EQUAL TO
|
297
|
+
0xA4 => "\x20\x44", # FRACTION SLASH
|
298
|
+
0xA5 => "\x22\x1E", # INFINITY
|
299
|
+
0xA6 => "\x01\x92", # LATIN SMALL LETTER F WITH HOOK
|
300
|
+
0xA7 => "\x26\x63", # BLACK CLUB SUIT
|
301
|
+
0xA8 => "\x26\x66", # BLACK DIAMOND SUIT
|
302
|
+
0xA9 => "\x26\x65", # BLACK HEART SUIT
|
303
|
+
0xAA => "\x26\x60", # BLACK SPADE SUIT
|
304
|
+
0xAB => "\x21\x94", # LEFT RIGHT ARROW
|
305
|
+
0xAC => "\x21\x90", # LEFTWARDS ARROW
|
306
|
+
0xAD => "\x21\x91", # UPWARDS ARROW
|
307
|
+
0xAE => "\x21\x92", # RIGHTWARDS ARROW
|
308
|
+
0xAF => "\x21\x93", # DOWNWARDS ARROW
|
309
|
+
0xB0 => "\x00\xB0", # DEGREE SIGN
|
310
|
+
0xB1 => "\x00\xB1", # PLUS-MINUS SIGN
|
311
|
+
0xB2 => "\x20\x33", # DOUBLE PRIME # second
|
312
|
+
0xB3 => "\x22\x65", # GREATER-THAN OR EQUAL TO
|
313
|
+
0xB4 => "\x00\xD7", # MULTIPLICATION SIGN
|
314
|
+
0xB5 => "\x22\x1D", # PROPORTIONAL TO
|
315
|
+
0xB6 => "\x22\x02", # PARTIAL DIFFERENTIAL
|
316
|
+
0xB7 => "\x20\x22", # BULLET
|
317
|
+
0xB8 => "\x00\xF7", # DIVISION SIGN
|
318
|
+
0xB9 => "\x22\x60", # NOT EQUAL TO
|
319
|
+
0xBA => "\x22\x61", # IDENTICAL TO
|
320
|
+
0xBB => "\x22\x48", # ALMOST EQUAL TO
|
321
|
+
0xBC => "\x20\x26", # HORIZONTAL ELLIPSIS
|
322
|
+
0xBD => "\x23\xD0", # VERTICAL LINE EXTENSION (for arrows) # for Unicode 4.0 and later
|
323
|
+
0xBE => "\x23\xAF", # HORIZONTAL LINE EXTENSION (for arrows) # for Unicode 3.2 and later
|
324
|
+
0xBF => "\x21\xB5", # DOWNWARDS ARROW WITH CORNER LEFTWARDS
|
325
|
+
0xC0 => "\x21\x35", # ALEF SYMBOL
|
326
|
+
0xC1 => "\x21\x11", # BLACK-LETTER CAPITAL I
|
327
|
+
0xC2 => "\x21\x1C", # BLACK-LETTER CAPITAL R
|
328
|
+
0xC3 => "\x21\x18", # SCRIPT CAPITAL P
|
329
|
+
0xC4 => "\x22\x97", # CIRCLED TIMES
|
330
|
+
0xC5 => "\x22\x95", # CIRCLED PLUS
|
331
|
+
0xC6 => "\x22\x05", # EMPTY SET
|
332
|
+
0xC7 => "\x22\x29", # INTERSECTION
|
333
|
+
0xC8 => "\x22\x2A", # UNION
|
334
|
+
0xC9 => "\x22\x83", # SUPERSET OF
|
335
|
+
0xCA => "\x22\x87", # SUPERSET OF OR EQUAL TO
|
336
|
+
0xCB => "\x22\x84", # NOT A SUBSET OF
|
337
|
+
0xCC => "\x22\x82", # SUBSET OF
|
338
|
+
0xCD => "\x22\x86", # SUBSET OF OR EQUAL TO
|
339
|
+
0xCE => "\x22\x08", # ELEMENT OF
|
340
|
+
0xCF => "\x22\x09", # NOT AN ELEMENT OF
|
341
|
+
0xD0 => "\x22\x20", # ANGLE
|
342
|
+
0xD1 => "\x22\x07", # NABLA
|
343
|
+
0xD2 => "\x00\xAE", # REGISTERED SIGN # serif
|
344
|
+
0xD3 => "\x00\xA9", # COPYRIGHT SIGN # serif
|
345
|
+
0xD4 => "\x21\x22", # TRADE MARK SIGN # serif
|
346
|
+
0xD5 => "\x22\x0F", # N-ARY PRODUCT
|
347
|
+
0xD6 => "\x22\x1A", # SQUARE ROOT
|
348
|
+
0xD7 => "\x22\xC5", # DOT OPERATOR
|
349
|
+
0xD8 => "\x00\xAC", # NOT SIGN
|
350
|
+
0xD9 => "\x22\x27", # LOGICAL AND
|
351
|
+
0xDA => "\x22\x28", # LOGICAL OR
|
352
|
+
0xDB => "\x21\xD4", # LEFT RIGHT DOUBLE ARROW
|
353
|
+
0xDC => "\x21\xD0", # LEFTWARDS DOUBLE ARROW
|
354
|
+
0xDD => "\x21\xD1", # UPWARDS DOUBLE ARROW
|
355
|
+
0xDE => "\x21\xD2", # RIGHTWARDS DOUBLE ARROW
|
356
|
+
0xDF => "\x21\xD3", # DOWNWARDS DOUBLE ARROW
|
357
|
+
0xE0 => "\x25\xCA", # LOZENGE # previously mapped to 0x22C4 DIAMOND OPERATOR
|
358
|
+
0xE1 => "\x30\x08", # LEFT ANGLE BRACKET
|
359
|
+
0xE2 => "\x00\xAE", # REGISTERED SIGN, alternate: sans serif (0xF87F)
|
360
|
+
0xE3 => "\x00\xA9", # COPYRIGHT SIGN, alternate: sans serif (0xF87F)
|
361
|
+
0xE4 => "\x21\x22", # TRADE MARK SIGN, alternate: sans serif (0xF87F)
|
362
|
+
0xE5 => "\x22\x11", # N-ARY SUMMATION
|
363
|
+
0xE6 => "\x23\x9B", # LEFT PARENTHESIS UPPER HOOK # for Unicode 3.2 and later
|
364
|
+
0xE7 => "\x23\x9C", # LEFT PARENTHESIS EXTENSION # for Unicode 3.2 and later
|
365
|
+
0xE8 => "\x23\x9D", # LEFT PARENTHESIS LOWER HOOK # for Unicode 3.2 and later
|
366
|
+
0xE9 => "\x23\xA1", # LEFT SQUARE BRACKET UPPER CORNER # for Unicode 3.2 and later
|
367
|
+
0xEA => "\x23\xA2", # LEFT SQUARE BRACKET EXTENSION # for Unicode 3.2 and later
|
368
|
+
0xEB => "\x23\xA3", # LEFT SQUARE BRACKET LOWER CORNER # for Unicode 3.2 and later
|
369
|
+
0xEC => "\x23\xA7", # LEFT CURLY BRACKET UPPER HOOK # for Unicode 3.2 and later
|
370
|
+
0xED => "\x23\xA8", # LEFT CURLY BRACKET MIDDLE PIECE # for Unicode 3.2 and later
|
371
|
+
0xEE => "\x23\xA9", # LEFT CURLY BRACKET LOWER HOOK # for Unicode 3.2 and later
|
372
|
+
0xEF => "\x23\xAA", # CURLY BRACKET EXTENSION # for Unicode 3.2 and later
|
373
|
+
0xF0 => "\xF8\xFF", # Apple logo
|
374
|
+
0xF1 => "\x30\x09", # RIGHT ANGLE BRACKET
|
375
|
+
0xF2 => "\x22\x2B", # INTEGRAL
|
376
|
+
0xF3 => "\x23\x20", # TOP HALF INTEGRAL
|
377
|
+
0xF4 => "\x23\xAE", # INTEGRAL EXTENSION # for Unicode 3.2 and later
|
378
|
+
0xF5 => "\x23\x21", # BOTTOM HALF INTEGRAL
|
379
|
+
0xF6 => "\x23\x9E", # RIGHT PARENTHESIS UPPER HOOK # for Unicode 3.2 and later
|
380
|
+
0xF7 => "\x23\x9F", # RIGHT PARENTHESIS EXTENSION # for Unicode 3.2 and later
|
381
|
+
0xF8 => "\x23\xA0", # RIGHT PARENTHESIS LOWER HOOK # for Unicode 3.2 and later
|
382
|
+
0xF9 => "\x23\xA4", # RIGHT SQUARE BRACKET UPPER CORNER # for Unicode 3.2 and later
|
383
|
+
0xFA => "\x23\xA5", # RIGHT SQUARE BRACKET EXTENSION # for Unicode 3.2 and later
|
384
|
+
0xFB => "\x23\xA6", # RIGHT SQUARE BRACKET LOWER CORNER # for Unicode 3.2 and later
|
385
|
+
0xFC => "\x23\xAB", # RIGHT CURLY BRACKET UPPER HOOK # for Unicode 3.2 and later
|
386
|
+
0xFD => "\x23\xAC", # RIGHT CURLY BRACKET MIDDLE PIECE # for Unicode 3.2 and later
|
387
|
+
0xFE => "\x23\xAD", # RIGHT CURLY BRACKET LOWER HOOK # for Unicode 3.2 and later
|
388
|
+
}
|
389
|
+
SYMBOL_MAP = UNICODE_MAP.invert
|
390
|
+
# Returns the Symbol-font character code for +name+.
# +name+ is looked up in NAMES first; if it has no entry there, SYMBOL_MAP
# (the inverted UNICODE_MAP) is consulted. Returns nil when neither table
# knows +name+.
def Symbol.byte(name)
  code = NAMES[name]
  return code if code
  SYMBOL_MAP[name]
end
|
393
|
+
# Translates Symbol-font encoded text by replacing every byte of +txt+ with
# the two-byte string UNICODE_MAP holds for it. Bytes without an entry in
# the table are dropped. Returns a new string.
def Symbol.to_utf16(txt)
  txt.each_byte.inject('') do |utf16, code|
    utf16 << UNICODE_MAP.fetch(code, '')
  end
end
|
400
|
+
# Reverse of to_utf16: walks +txt+ two bytes at a time, looks each pair up
# in SYMBOL_MAP and appends the Symbol-font character code found there.
# Pairs without an entry are dropped; a trailing single byte is ignored.
# Returns a new string.
def Symbol.from_utf16(txt)
  res = ''
  # /m is required so that "." also accepts the byte 0x0A. Without it a
  # two-byte unit containing a line feed was never matched, and a unit whose
  # first byte is 0x0A shifted the pairing by one byte, so the characters
  # after it were dropped or decoded wrongly.
  txt.scan(/../mn) { |pair|
    res << SYMBOL_MAP.fetch(pair, '')
  }
  res
end
|
407
|
+
end
|
408
|
+
end
|