rpdf2txt 0.8.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (127) hide show
  1. data/History.txt +5 -0
  2. data/LICENCE +515 -0
  3. data/Manifest.txt +126 -0
  4. data/README.txt +30 -0
  5. data/Rakefile +24 -0
  6. data/bin/rpdf2txt +58 -0
  7. data/config.save +12 -0
  8. data/install.rb +1098 -0
  9. data/lib/rpdf2txt-rockit/base_extensions.rb +73 -0
  10. data/lib/rpdf2txt-rockit/bootstrap.rb +120 -0
  11. data/lib/rpdf2txt-rockit/bounded_lru_cache.rb +43 -0
  12. data/lib/rpdf2txt-rockit/conflict_resolution.rb +302 -0
  13. data/lib/rpdf2txt-rockit/directed_graph.rb +401 -0
  14. data/lib/rpdf2txt-rockit/glr_parser.rb +393 -0
  15. data/lib/rpdf2txt-rockit/grammar.rb +644 -0
  16. data/lib/rpdf2txt-rockit/graphdrawing.rb +107 -0
  17. data/lib/rpdf2txt-rockit/graphviz_dot.rb +63 -0
  18. data/lib/rpdf2txt-rockit/indexable.rb +53 -0
  19. data/lib/rpdf2txt-rockit/lalr_parsetable_generator.rb +144 -0
  20. data/lib/rpdf2txt-rockit/parse_table.rb +273 -0
  21. data/lib/rpdf2txt-rockit/parsetable_generation.rb +164 -0
  22. data/lib/rpdf2txt-rockit/parsing_ambiguities.rb +84 -0
  23. data/lib/rpdf2txt-rockit/profiler.rb +168 -0
  24. data/lib/rpdf2txt-rockit/reduce_actions_generator.rb +523 -0
  25. data/lib/rpdf2txt-rockit/rockit.rb +76 -0
  26. data/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb +187 -0
  27. data/lib/rpdf2txt-rockit/rockit_grammars_parser.rb +126 -0
  28. data/lib/rpdf2txt-rockit/sourcecode_dumpable.rb +181 -0
  29. data/lib/rpdf2txt-rockit/stringscanner.rb +54 -0
  30. data/lib/rpdf2txt-rockit/syntax_tree.rb +452 -0
  31. data/lib/rpdf2txt-rockit/token.rb +364 -0
  32. data/lib/rpdf2txt-rockit/version.rb +3 -0
  33. data/lib/rpdf2txt/attributesparser.rb +42 -0
  34. data/lib/rpdf2txt/cmapparser.rb +65 -0
  35. data/lib/rpdf2txt/data/_cmap.grammar +11 -0
  36. data/lib/rpdf2txt/data/_cmap_range.grammar +15 -0
  37. data/lib/rpdf2txt/data/_pdfattributes.grammar +32 -0
  38. data/lib/rpdf2txt/data/cmap.grammar +11 -0
  39. data/lib/rpdf2txt/data/cmap.rb +37 -0
  40. data/lib/rpdf2txt/data/cmap_range.grammar +15 -0
  41. data/lib/rpdf2txt/data/cmap_range.rb +43 -0
  42. data/lib/rpdf2txt/data/fonts/Courier-Bold.afm +342 -0
  43. data/lib/rpdf2txt/data/fonts/Courier-BoldOblique.afm +342 -0
  44. data/lib/rpdf2txt/data/fonts/Courier-Oblique.afm +342 -0
  45. data/lib/rpdf2txt/data/fonts/Courier.afm +342 -0
  46. data/lib/rpdf2txt/data/fonts/Helvetica-Bold.afm +2827 -0
  47. data/lib/rpdf2txt/data/fonts/Helvetica-BoldOblique.afm +2827 -0
  48. data/lib/rpdf2txt/data/fonts/Helvetica-Oblique.afm +3051 -0
  49. data/lib/rpdf2txt/data/fonts/Helvetica.afm +3051 -0
  50. data/lib/rpdf2txt/data/fonts/License-Adobe.txt +65 -0
  51. data/lib/rpdf2txt/data/fonts/Symbol.afm +213 -0
  52. data/lib/rpdf2txt/data/fonts/Times-Bold.afm +2588 -0
  53. data/lib/rpdf2txt/data/fonts/Times-BoldItalic.afm +2384 -0
  54. data/lib/rpdf2txt/data/fonts/Times-Italic.afm +2667 -0
  55. data/lib/rpdf2txt/data/fonts/Times-Roman.afm +2419 -0
  56. data/lib/rpdf2txt/data/fonts/ZapfDingbats.afm +225 -0
  57. data/lib/rpdf2txt/data/pdfattributes.grammar +32 -0
  58. data/lib/rpdf2txt/data/pdfattributes.rb +71 -0
  59. data/lib/rpdf2txt/data/pdftext.grammar +102 -0
  60. data/lib/rpdf2txt/data/pdftext.rb +146 -0
  61. data/lib/rpdf2txt/default_handler.rb +352 -0
  62. data/lib/rpdf2txt/lzw.rb +69 -0
  63. data/lib/rpdf2txt/object.rb +1114 -0
  64. data/lib/rpdf2txt/parser.rb +169 -0
  65. data/lib/rpdf2txt/symbol.rb +408 -0
  66. data/lib/rpdf2txt/text.rb +182 -0
  67. data/lib/rpdf2txt/text_state.rb +434 -0
  68. data/lib/rpdf2txt/textparser.rb +42 -0
  69. data/test/data/3392_obj +0 -0
  70. data/test/data/397_decrypted +15 -0
  71. data/test/data/450_decrypted +153 -0
  72. data/test/data/450_obj +0 -0
  73. data/test/data/452_decrypted +125 -0
  74. data/test/data/454_decrypted +108 -0
  75. data/test/data/456_decrypted +106 -0
  76. data/test/data/458_decrypted +111 -0
  77. data/test/data/458_obj +0 -0
  78. data/test/data/460_decrypted +118 -0
  79. data/test/data/460_obj +0 -0
  80. data/test/data/463_decrypted +117 -0
  81. data/test/data/465_decrypted +107 -0
  82. data/test/data/465_obj +0 -0
  83. data/test/data/90_obj +0 -0
  84. data/test/data/90_obj_comp +1 -0
  85. data/test/data/decrypted +0 -0
  86. data/test/data/encrypt_obj +0 -0
  87. data/test/data/encrypt_string +0 -0
  88. data/test/data/encrypt_string_128bit +0 -0
  89. data/test/data/encrypted_object_stream.pdf +0 -0
  90. data/test/data/firststream +1 -0
  91. data/test/data/index.pdfobj +0 -0
  92. data/test/data/index_2bit.pdfobj +0 -0
  93. data/test/data/index_masked.pdfobj +0 -0
  94. data/test/data/indexed.pdfobj +0 -0
  95. data/test/data/indexed_2bit.pdfobj +0 -0
  96. data/test/data/indexed_masked.pdfobj +0 -0
  97. data/test/data/inline.png +0 -0
  98. data/test/data/logo.png +0 -0
  99. data/test/data/lzw.pdfobj +0 -0
  100. data/test/data/lzw_index.pdfobj +0 -0
  101. data/test/data/page_tree.pdf +148 -0
  102. data/test/data/pdf_20.png +0 -0
  103. data/test/data/pdf_21.png +0 -0
  104. data/test/data/pdf_22.png +0 -0
  105. data/test/data/pdf_50.png +0 -0
  106. data/test/data/png.pdfobj +0 -0
  107. data/test/data/space_bug_stream.txt +119 -0
  108. data/test/data/stream.txt +292 -0
  109. data/test/data/stream_kerning_bug.txt +13 -0
  110. data/test/data/stream_kerning_bug2.txt +6 -0
  111. data/test/data/test.pdf +0 -0
  112. data/test/data/test.txt +8 -0
  113. data/test/data/test_text.txt +42 -0
  114. data/test/data/working_obj +0 -0
  115. data/test/data/working_obj2 +0 -0
  116. data/test/mock.rb +149 -0
  117. data/test/suite.rb +30 -0
  118. data/test/test_pdf_object.rb +1802 -0
  119. data/test/test_pdf_parser.rb +1340 -0
  120. data/test/test_pdf_text.rb +789 -0
  121. data/test/test_space_bug_05_2004.rb +87 -0
  122. data/test/test_stream.rb +194 -0
  123. data/test/test_text_state.rb +315 -0
  124. data/usage-en.txt +112 -0
  125. data/user-stories/UserStories_Rpdf2Txt.txt +34 -0
  126. data/user-stories/documents/swissmedicjournal/04_2004.pdf +0 -0
  127. metadata +220 -0
@@ -0,0 +1,169 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # Rpdf2txt -- PDF to Text Parser
4
+ # Copyright (C) 2003 Andreas Schrafl, Hannes Wyss
5
+ #
6
+ # This library is free software; you can redistribute it and/or
7
+ # modify it under the terms of the GNU Lesser General Public
8
+ # License as published by the Free Software Foundation; either
9
+ # version 2.1 of the License, or (at your option) any later version.
10
+ #
11
+ # This library is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ # Lesser General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Lesser General Public
17
+ # License along with this library; if not, write to the Free Software
18
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ #
20
+ # ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
21
+ # hwyss@ywesee.com, aschrafl@ywesee.com
22
+ #
23
+ # PdfParser -- Rpdf2txt-- 14.11.2002 -- aschrafl@ywesee.com
24
+
25
+ require 'zlib'
26
+ require 'rpdf2txt/object'
27
+ require 'rpdf2txt/default_handler'
28
+ require 'md5'
29
+
30
+ module Rpdf2txt
31
+ VERSION = '0.8.2'
32
+ class Parser
33
+ attr_accessor :encrypt
34
# Creates a parser over an in-memory PDF source.
#
# pdf_stream      - the PDF source; kept in @src, which the other methods
#                   search with regular expressions and slice by index.
# target_encoding - handed to every object built from the source
#                   (defaults to 'utf8').
def initialize(pdf_stream, target_encoding='utf8')
  @src              = pdf_stream
  @target_encoding  = target_encoding
  # Populated lazily by object_catalogue / build_trailer_dictionary.
  @object_catalogue = nil
  @encrypt_id       = nil
end
40
# Pushes the text of every page through +callback_handler+.
#
# For each node yielded by page_tree, node.text(callback_handler) is called,
# followed by callback_handler.send_page. After the last node,
# callback_handler.send_eof is called and its result is returned.
def extract_text(callback_handler = SimpleHandler.new)
  page_tree.each do |page|
    page.text(callback_handler)
    callback_handler.send_page
  end
  callback_handler.send_eof
end
47
# Returns the object catalogue, building it with build_object_catalogue on
# first use and reusing the cached value afterwards.
def object_catalogue
  return @object_catalogue if @object_catalogue
  @object_catalogue = build_object_catalogue
end
50
# Returns the page tree, building it with build_page_tree on first use and
# reusing the cached value afterwards.
def page_tree
  return @page_tree if @page_tree
  @page_tree = build_page_tree
end
53
+ # helper methods
54
# Locates the trailer information of the document, stores it in
# @trailer_dictionary and returns it (nil when nothing usable is found).
#
# Sources are consulted in this order:
#   1. a TrailerDictionary already present in the object catalogue;
#   2. every "trailer ... startxref" section in @src; the first one becomes
#      the trailer (unless step 1 found one), later ones are merged in via
#      TrailerDictionary#update;
#   3. if there is still no trailer, the object starting at the offset given
#      by "startxref <offset> %%EOF", up to and including its "endobj".
#
# When the trailer names an encryption object that exists in the catalogue,
# a PdfEncrypt is created from it, stored in @encrypt and assigned as the
# decoder of every catalogue object.
def build_trailer_dictionary
  # Go through the accessor so the catalogue is built on demand; this method
  # is public and may be called before anything else has touched it.
  catalogue = object_catalogue
  @trailer_dictionary = catalogue.values.find do |obj|
    obj.is_a?(TrailerDictionary)
  end
  startobj = 0
  endobj = 0
  while(endobj && (startobj = @src.index(/\btrailer/n, endobj)))
    if(endobj = @src.index(/startxref/n, startobj))
      # +8 puts endobj on the last character of "startxref"; the slice
      # below is inclusive.
      endobj += 8
      trailer_src = @src[startobj..endobj]
      trailer_dictionary = TrailerDictionary.new(trailer_src, @target_encoding)
      if(@trailer_dictionary.nil?)
        @trailer_dictionary = trailer_dictionary
      else
        @trailer_dictionary.update(trailer_dictionary)
      end
    end
  end
  if @trailer_dictionary.nil? \
    && (match = /startxref\s*(\d+)\s*%%EOF/m.match(@src))
    startobj = match[1].to_i
    # A truncated file may have no "endobj" after the offset; skip the
    # fallback instead of failing on nil + 6.
    if (endobj = @src.index(/endobj/n, startobj))
      xref_src = @src[startobj...(endobj + 6)]
      @trailer_dictionary = TrailerDictionary.new(xref_src, @target_encoding)
    end
  end
  if @trailer_dictionary \
    && (@encrypt_id = @trailer_dictionary.encrypt_id) \
    && (encrypt_obj = catalogue[@encrypt_id])
    @encrypt = PdfEncrypt.new(encrypt_obj.src)
    @encrypt.file_id = @trailer_dictionary.file_id
    catalogue.each_value do |member|
      member.decoder = @encrypt
    end
  end
  @trailer_dictionary
end
89
# Returns the trailer dictionary, delegating to build_trailer_dictionary
# while none has been cached yet.
def trailer_dictionary
  return @trailer_dictionary if @trailer_dictionary
  @trailer_dictionary = build_trailer_dictionary
end
92
+ private
93
# Picks the wrapper class for one raw object source and instantiates it with
# (src, @target_encoding).
#
# The patterns are tried from top to bottom and the first one that matches
# +src+ decides the class; anything that matches none becomes an Unknown.
def build_object(src)
  wrapper =
    case src
    when /\/Type\s*\/Catalog\b/n
      CatalogNode
    when /\/Type\s*\/Pages\b/n
      PageNode
    when /\/Type\s*\/Page\b/n
      PageLeaf
    when /\/Type\s*\/Font\b/n
      Font
    when /\/Type\s*\/FontDescriptor\b/n
      FontDescriptor
    when /\/Type\s*\/Encoding\b/n
      Encoding
    when /\/Type\s*\/ObjStm\b/n
      ObjStream
    when /\/Type\s*\/XRef\b/n
      TrailerDictionary
    when %r!/Subtype\s*/Image!n
      Image
    when /\bstream\b/n, %r{/ToUnicode\b}n
      Stream
    when /\/Font\s*<</mn
      Resource
    when /^(?:\d+\s+){2}obj\s*\[\s*(?:(\d+\s+){2}R\s*)*\]\s+endobj/mn
      # array made up of indirect references only
      ReferenceArray
    when /^(?:\d+\s+){2}obj\s*\[\s*(?:(\d+\s*))*\]\s+endobj/mn
      # array made up of plain integers only
      PdfArray
    when /obj\s*<</mn
      PdfHash
    else
      Unknown
    end
  wrapper.new(src, @target_encoding)
end
127
# Scans @src for every "<num> <gen> obj ... endobj" section, wraps each one
# with build_object and returns a Hash that maps obj.oid to the wrapper.
#
# The match is non-greedy, so each section ends at the first "endobj" that
# follows its "obj" keyword.
def build_object_catalogue
  catalogue = {}
  @src.scan(/(?:\d+ ){2}obj\b.*?\bendobj\b/mn) do |obj_src|
    obj = build_object(obj_src.to_s)
    catalogue.store(obj.oid, obj)
  end
  catalogue
end
137
# Expands every ObjStream currently in the catalogue: its decoded stream is
# passed to scan_object_stream together with the catalogue, which adds the
# objects it finds there. Returns the array of ObjStreams that were
# processed.
def rebuild_object_catalogue
  # The list is taken before scanning starts, so objects added while
  # scanning are not visited themselves.
  streams = object_catalogue.values.select { |obj| obj.is_a?(ObjStream) }
  streams.each do |stream|
    scan_object_stream(stream.decoded_stream, object_catalogue)
  end
end
144
# Resolves the root of the page tree and lets it build the tree from the
# object catalogue; returns whatever build_tree returns.
def build_page_tree
  root = page_tree_root
  root.build_tree(object_catalogue)
end
147
# Splits a decoded object stream into its objects and stores them in
# +catalogue+.
#
# src       - decoded stream text: a header of whitespace separated
#             "<object id> <offset>" integer pairs followed by the object
#             data; each offset is applied to the data that follows the
#             header.
# catalogue - Hash that receives the objects, keyed by obj.oid.
#
# Every object is re-wrapped as "<id> 0 obj ... endobj" so that build_object
# can classify it like a top-level object. Returns +catalogue+; it is
# returned unchanged when +src+ does not start a line with such a header.
def scan_object_stream(src, catalogue)
  # /m lets "." match newlines, so the object data is captured up to the
  # end of the stream instead of stopping at the first line break.
  match = /^(?<pairs>(\d+\s+\d+\s+?)+)(?<objects>.*)/m.match src
  return catalogue unless match
  pairs, objects = match[:pairs], match[:objects]
  offsets = pairs.scan(/(\d+)\s+(\d+)/).collect do |obj_id, offset|
    [obj_id.to_i, offset.to_i]
  end
  offsets.each_with_index do |(obj_id, offset), idx|
    # An object extends to the offset of its successor; the last one runs
    # to the end of the data.
    _, nxt_offset = offsets[idx.next]
    obj_src = sprintf "%i 0 obj %s endobj", obj_id,
      objects[offset...(nxt_offset || objects.length)]
    obj = build_object(obj_src)
    catalogue.store(obj.oid, obj)
  end
  catalogue
end
162
# Returns the catalogue entry that the trailer names as document root.
#
# The catalogue and the trailer are obtained first; only then are object
# streams expanded via rebuild_object_catalogue, and the root is looked up
# last, in the expanded catalogue.
def page_tree_root
  objects = object_catalogue
  trailer = trailer_dictionary
  rebuild_object_catalogue
  root_id = trailer.root_id
  objects[root_id]
end
168
+ end
169
+ end
@@ -0,0 +1,408 @@
1
+ #!/usr/bin/env ruby
2
+ # Symbol -- Rpdf2txt -- 27.04.2007 -- hwyss@ywesee.com
3
+
4
+ module Rpdf2txt
5
+ module Symbol
6
+ NAMES = {
7
+ "Alpha" => 0101, # Α
8
+ "Beta" => 0102, # Β
9
+ "Chi" => 0103, # Χ
10
+ "Delta" => 0104, # Δ
11
+ "Epsilon" => 0105, # Ε
12
+ "Eta" => 0110, # Η
13
+ "Euro" => 0240, # €
14
+ "Gamma" => 0107, # Γ
15
+ "Ifraktur" => 0301, # ℑ
16
+ "Iota" => 0111, # Ι
17
+ "Kappa" => 0113, # Κ
18
+ "Lambda" => 0114, # Λ
19
+ "Mu" => 0115, # Μ
20
+ "Nu" => 0116, # Ν
21
+ "Omega" => 0127, # Ω
22
+ "Omicron" => 0117, # Ο
23
+ "Phi" => 0106, # Φ
24
+ "Pi" => 0120, # Π
25
+ "Psi" => 0131, # Ψ
26
+ "Rfraktur" => 0302, # ℜ
27
+ "Rho" => 0122, # Ρ
28
+ "Sigma" => 0123, # Σ
29
+ "Tau" => 0124, # Τ
30
+ "Theta" => 0121, # Θ
31
+ "Upsilon" => 0125, # Υ
32
+ "Upsilon1" => 0241, # ϒ
33
+ "Xi" => 0130, # Ξ
34
+ "Zeta" => 0132, # Ζ
35
+ "aleph" => 0300, # ℵ
36
+ "alpha" => 0141, # α
37
+ "ampersand" => 0046, # &
38
+ "angle" => 0320, # ∠
39
+ "angleleft" => 0341, # 〈
40
+ "angleright" => 0361, # 〉
41
+ "approxequal" => 0273, # ≈
42
+ "arrowboth" => 0253, # ↔
43
+ "arrowdblboth" => 0333, # ⇔
44
+ "arrowdbldown" => 0337, # ⇓
45
+ "arrowdblleft" => 0334, # ⇐
46
+ "arrowdblright" => 0336, # ⇒
47
+ "arrowdblup" => 0335, # ⇑
48
+ "arrowdown" => 0257, # ↓
49
+ "arrowhorizex" => 0276, # 
50
+ "arrowleft" => 0254, # ←
51
+ "arrowright" => 0256, # →
52
+ "arrowup" => 0255, # ↑
53
+ "arrowvertex" => 0275, # 
54
+ "asteriskmath" => 0052, # ∗
55
+ "bar" => 0174, # |
56
+ "beta" => 0142, # β
57
+ "braceleft" => 0173, # {
58
+ "braceright" => 0175, # }
59
+ "bracelefttp" => 0354, # 
60
+ "braceleftmid" => 0355, # 
61
+ "braceleftbt" => 0356, # 
62
+ "bracerighttp" => 0374, # 
63
+ "bracerightmid" => 0375, # 
64
+ "bracerightbt" => 0376, # 
65
+ "braceex" => 0357, # 
66
+ "bracketleft" => 0133, # [
67
+ "bracketright" => 0135, # ]
68
+ "bracketlefttp" => 0351, # 
69
+ "bracketleftex" => 0352, # 
70
+ "bracketleftbt" => 0353, # 
71
+ "bracketrighttp" => 0371, # 
72
+ "bracketrightex" => 0372, # 
73
+ "bracketrightbt" => 0373, # 
74
+ "bullet" => 0267, # •
75
+ "carriagereturn" => 0277, # ↵
76
+ "chi" => 0143, # χ
77
+ "circlemultiply" => 0304, # ⊗
78
+ "circleplus" => 0305, # ⊕
79
+ "club" => 0247, # ♣
80
+ "colon" => 0072, # :
81
+ "comma" => 0054, # ,
82
+ "congruent" => 0100, # ≅
83
+ "copyrightsans" => 0343, # 
84
+ "copyrightserif" => 0323,
85
+ "degree" => 0260, # °
86
+ "delta" => 0144, # δ
87
+ "diamond" => 0250, # ♦
88
+ "divide" => 0270, # ÷
89
+ "dotmath" => 0327, # ⋅
90
+ "eight" => 0070, # 8,
91
+ "element" => 0316, # ∈
92
+ "ellipsis" => 0274, # …
93
+ "emptyset" => 0306, # ∅
94
+ "epsilon" => 0145, # ε
95
+ "equal" => 0075, # =
96
+ "equivalence" => 0272, # ≡
97
+ "eta" => 0150, # η
98
+ "exclam" => 0041, # !
99
+ "existential" => 0044, # ∃
100
+ "five" => 0065, # 5,
101
+ "florin" => 0246, # ƒ
102
+ "four" => 0064, # 4,
103
+ "fraction" => 0244, # ⁄
104
+ "gamma" => 0147, # γ
105
+ "gradient" => 0321, # ∇
106
+ "greater" => 0076, # >
107
+ "greaterequal" => 0263, # ≥
108
+ "heart" => 0251, # ♥
109
+ "infinity" => 0245, # ∞
110
+ "integral" => 0362, # ∫
111
+ "integraltp" => 0363, # ⌠
112
+ "integralex" => 0364, # 
113
+ "integralbt" => 0365, # ⌡
114
+ "intersection" => 0307, # ∩
115
+ "iota" => 0151, # ι
116
+ "kappa" => 0153, # κ
117
+ "lambda" => 0154, # λ
118
+ "less" => 0074, # <
119
+ "lessequal" => 0243, # ≤
120
+ "logicaland" => 0331, # ∧
121
+ "logicalnot" => 0330, # ¬
122
+ "logicalor" => 0332, # ∨
123
+ "lozenge" => 0340, # ◊
124
+ "minus" => 0055, # −
125
+ "minute" => 0242, # ′
126
+ "mu" => 0155, # μ
127
+ "multiply" => 0264, # ×
128
+ "nine" => 0071, # 9,
129
+ "notelement" => 0317, # ∉
130
+ "notequal" => 0271, # ≠
131
+ "notsubset" => 0313, # ⊄
132
+ "nu" => 0156, # ν
133
+ "numbersign" => 0043, # #
134
+ "omega" => 0167, # ω
135
+ "omega1" => 0166, # ϖ
136
+ "omicron" => 0157, # ο
137
+ "one" => 0061, # 1,
138
+ "parenleft" => 0050, # (
139
+ "parenright" => 0051, # )
140
+ "parenlefttp" => 0346, # 
141
+ "parenleftex" => 0347, # 
142
+ "parenleftbt" => 0350, # 
143
+ "parenrighttp" => 0366, # 
144
+ "parenrightex" => 0367, # 
145
+ "parenrightbt" => 0370, # 
146
+ "partialdiff" => 0266, # ∂
147
+ "percent" => 0045, # %
148
+ "period" => 0056, # .
149
+ "perpendicular" => 0136, # ⊥
150
+ "phi" => 0146, # φ
151
+ "phi1" => 0152,
152
+ "pi" => 0160, # π
153
+ "plus" => 0053, # +
154
+ "plusminus" => 0261, # ±
155
+ "product" => 0325, # Π
156
+ "propersubset" => 0314, # ⊂
157
+ "propersuperset" => 0311, # ⊃
158
+ "proportional" => 0265, # ∝
159
+ "psi" => 0171, # ψ
160
+ "question" => 0077, # ?
161
+ "radical" => 0326, # √
162
+ "radicalex" => 0140,
163
+ "reflexsubset" => 0315, # ⊆
164
+ "reflexsuperset" => 0312, # ⊇
165
+ "registersans" => 0342, # 
166
+ "registerserif" => 0322,
167
+ "rho" => 0162, # ρ
168
+ "second" => 0262, # ″
169
+ "semicolon" => 0073, # ;
170
+ "seven" => 0067, # 7,
171
+ "sigma" => 0163, # σ
172
+ "sigma1" => 0126, # ς
173
+ "similar" => 0176, # ∼
174
+ "six" => 0066, # 6,
175
+ "slash" => 0057, # /
176
+ "space" => 0040,
177
+ "spade" => 0252, # ♠
178
+ "suchthat" => 0047, # ∋
179
+ "summation" => 0345, # Σ
180
+ "tau" => 0164, # τ
181
+ "therefore" => 0134, # ∴
182
+ "theta" => 0161, # θ
183
+ "theta1" => 0112,
184
+ "three" => 0063, # 3,
185
+ "trademarksans" => 0344, # 
186
+ "trademarkserif" => 0324,
187
+ "two" => 0062, # 2,
188
+ "underscore" => 0137, # _
189
+ "union" => 0310, # ∪
190
+ "universal" => 0042, # ∀
191
+ "upsilon" => 0165, # υ
192
+ "weierstrass" => 0303, # ℘
193
+ "xi" => 0170, # ξ
194
+ "zero" => 0060, # 0,
195
+ "zeta" => 0172, # ζ
196
+ }
197
+ UNICODE_MAP = { # based on http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/SYMBOL.TXT
198
+ 0x20 => "\x00\x20", # SPACE
199
+ 0x21 => "\x00\x21", # EXCLAMATION MARK
200
+ 0x22 => "\x22\x00", # FOR ALL
201
+ 0x23 => "\x00\x23", # NUMBER SIGN
202
+ 0x24 => "\x22\x03", # THERE EXISTS
203
+ 0x25 => "\x00\x25", # PERCENT SIGN
204
+ 0x26 => "\x00\x26", # AMPERSAND
205
+ 0x27 => "\x22\x0D", # SMALL CONTAINS AS MEMBER
206
+ 0x28 => "\x00\x28", # LEFT PARENTHESIS
207
+ 0x29 => "\x00\x29", # RIGHT PARENTHESIS
208
+ 0x2A => "\x22\x17", # ASTERISK OPERATOR
209
+ 0x2B => "\x00\x2B", # PLUS SIGN
210
+ 0x2C => "\x00\x2C", # COMMA
211
+ 0x2D => "\x22\x12", # MINUS SIGN
212
+ 0x2E => "\x00\x2E", # FULL STOP
213
+ 0x2F => "\x00\x2F", # SOLIDUS
214
+ 0x30 => "\x00\x30", # DIGIT ZERO
215
+ 0x31 => "\x00\x31", # DIGIT ONE
216
+ 0x32 => "\x00\x32", # DIGIT TWO
217
+ 0x33 => "\x00\x33", # DIGIT THREE
218
+ 0x34 => "\x00\x34", # DIGIT FOUR
219
+ 0x35 => "\x00\x35", # DIGIT FIVE
220
+ 0x36 => "\x00\x36", # DIGIT SIX
221
+ 0x37 => "\x00\x37", # DIGIT SEVEN
222
+ 0x38 => "\x00\x38", # DIGIT EIGHT
223
+ 0x39 => "\x00\x39", # DIGIT NINE
224
+ 0x3A => "\x00\x3A", # COLON
225
+ 0x3B => "\x00\x3B", # SEMICOLON
226
+ 0x3C => "\x00\x3C", # LESS-THAN SIGN
227
+ 0x3D => "\x00\x3D", # EQUALS SIGN
228
+ 0x3E => "\x00\x3E", # GREATER-THAN SIGN
229
+ 0x3F => "\x00\x3F", # QUESTION MARK
230
+ 0x40 => "\x22\x45", # APPROXIMATELY EQUAL TO
231
+ 0x41 => "\x03\x91", # GREEK CAPITAL LETTER ALPHA
232
+ 0x42 => "\x03\x92", # GREEK CAPITAL LETTER BETA
233
+ 0x43 => "\x03\xA7", # GREEK CAPITAL LETTER CHI
234
+ 0x44 => "\x03\x94", # GREEK CAPITAL LETTER DELTA
235
+ 0x45 => "\x03\x95", # GREEK CAPITAL LETTER EPSILON
236
+ 0x46 => "\x03\xA6", # GREEK CAPITAL LETTER PHI
237
+ 0x47 => "\x03\x93", # GREEK CAPITAL LETTER GAMMA
238
+ 0x48 => "\x03\x97", # GREEK CAPITAL LETTER ETA
239
+ 0x49 => "\x03\x99", # GREEK CAPITAL LETTER IOTA
240
+ 0x4A => "\x03\xD1", # GREEK THETA SYMBOL
241
+ 0x4B => "\x03\x9A", # GREEK CAPITAL LETTER KAPPA
242
+ 0x4C => "\x03\x9B", # GREEK CAPITAL LETTER LAMDA
243
+ 0x4D => "\x03\x9C", # GREEK CAPITAL LETTER MU
244
+ 0x4E => "\x03\x9D", # GREEK CAPITAL LETTER NU
245
+ 0x4F => "\x03\x9F", # GREEK CAPITAL LETTER OMICRON
246
+ 0x50 => "\x03\xA0", # GREEK CAPITAL LETTER PI
247
+ 0x51 => "\x03\x98", # GREEK CAPITAL LETTER THETA
248
+ 0x52 => "\x03\xA1", # GREEK CAPITAL LETTER RHO
249
+ 0x53 => "\x03\xA3", # GREEK CAPITAL LETTER SIGMA
250
+ 0x54 => "\x03\xA4", # GREEK CAPITAL LETTER TAU
251
+ 0x55 => "\x03\xA5", # GREEK CAPITAL LETTER UPSILON
252
+ 0x56 => "\x03\xC2", # GREEK SMALL LETTER FINAL SIGMA
253
+ 0x57 => "\x03\xA9", # GREEK CAPITAL LETTER OMEGA
254
+ 0x58 => "\x03\x9E", # GREEK CAPITAL LETTER XI
255
+ 0x59 => "\x03\xA8", # GREEK CAPITAL LETTER PSI
256
+ 0x5A => "\x03\x96", # GREEK CAPITAL LETTER ZETA
257
+ 0x5B => "\x00\x5B", # LEFT SQUARE BRACKET
258
+ 0x5C => "\x22\x34", # THEREFORE
259
+ 0x5D => "\x00\x5D", # RIGHT SQUARE BRACKET
260
+ 0x5E => "\x22\xA5", # UP TACK
261
+ 0x5F => "\x00\x5F", # LOW LINE
262
+ 0x60 => "\xF8\xE5", # radical extender # corporate char
263
+ 0x61 => "\x03\xB1", # GREEK SMALL LETTER ALPHA
264
+ 0x62 => "\x03\xB2", # GREEK SMALL LETTER BETA
265
+ 0x63 => "\x03\xC7", # GREEK SMALL LETTER CHI
266
+ 0x64 => "\x03\xB4", # GREEK SMALL LETTER DELTA
267
+ 0x65 => "\x03\xB5", # GREEK SMALL LETTER EPSILON
268
+ 0x66 => "\x03\xC6", # GREEK SMALL LETTER PHI
269
+ 0x67 => "\x03\xB3", # GREEK SMALL LETTER GAMMA
270
+ 0x68 => "\x03\xB7", # GREEK SMALL LETTER ETA
271
+ 0x69 => "\x03\xB9", # GREEK SMALL LETTER IOTA
272
+ 0x6A => "\x03\xD5", # GREEK PHI SYMBOL
273
+ 0x6B => "\x03\xBA", # GREEK SMALL LETTER KAPPA
274
+ 0x6C => "\x03\xBB", # GREEK SMALL LETTER LAMDA
275
+ 0x6D => "\x03\xBC", # GREEK SMALL LETTER MU
276
+ 0x6E => "\x03\xBD", # GREEK SMALL LETTER NU
277
+ 0x6F => "\x03\xBF", # GREEK SMALL LETTER OMICRON
278
+ 0x70 => "\x03\xC0", # GREEK SMALL LETTER PI
279
+ 0x71 => "\x03\xB8", # GREEK SMALL LETTER THETA
280
+ 0x72 => "\x03\xC1", # GREEK SMALL LETTER RHO
281
+ 0x73 => "\x03\xC3", # GREEK SMALL LETTER SIGMA
282
+ 0x74 => "\x03\xC4", # GREEK SMALL LETTER TAU
283
+ 0x75 => "\x03\xC5", # GREEK SMALL LETTER UPSILON
284
+ 0x76 => "\x03\xD6", # GREEK PI SYMBOL
285
+ 0x77 => "\x03\xC9", # GREEK SMALL LETTER OMEGA
286
+ 0x78 => "\x03\xBE", # GREEK SMALL LETTER XI
287
+ 0x79 => "\x03\xC8", # GREEK SMALL LETTER PSI
288
+ 0x7A => "\x03\xB6", # GREEK SMALL LETTER ZETA
289
+ 0x7B => "\x00\x7B", # LEFT CURLY BRACKET
290
+ 0x7C => "\x00\x7C", # VERTICAL LINE
291
+ 0x7D => "\x00\x7D", # RIGHT CURLY BRACKET
292
+ 0x7E => "\x22\x3C", # TILDE OPERATOR
293
+ 0xA0 => "\x20\xAC", # EURO SIGN
294
+ 0xA1 => "\x03\xD2", # GREEK UPSILON WITH HOOK SYMBOL
295
+ 0xA2 => "\x20\x32", # PRIME # minute
296
+ 0xA3 => "\x22\x64", # LESS-THAN OR EQUAL TO
297
+ 0xA4 => "\x20\x44", # FRACTION SLASH
298
+ 0xA5 => "\x22\x1E", # INFINITY
299
+ 0xA6 => "\x01\x92", # LATIN SMALL LETTER F WITH HOOK
300
+ 0xA7 => "\x26\x63", # BLACK CLUB SUIT
301
+ 0xA8 => "\x26\x66", # BLACK DIAMOND SUIT
302
+ 0xA9 => "\x26\x65", # BLACK HEART SUIT
303
+ 0xAA => "\x26\x60", # BLACK SPADE SUIT
304
+ 0xAB => "\x21\x94", # LEFT RIGHT ARROW
305
+ 0xAC => "\x21\x90", # LEFTWARDS ARROW
306
+ 0xAD => "\x21\x91", # UPWARDS ARROW
307
+ 0xAE => "\x21\x92", # RIGHTWARDS ARROW
308
+ 0xAF => "\x21\x93", # DOWNWARDS ARROW
309
+ 0xB0 => "\x00\xB0", # DEGREE SIGN
310
+ 0xB1 => "\x00\xB1", # PLUS-MINUS SIGN
311
+ 0xB2 => "\x20\x33", # DOUBLE PRIME # second
312
+ 0xB3 => "\x22\x65", # GREATER-THAN OR EQUAL TO
313
+ 0xB4 => "\x00\xD7", # MULTIPLICATION SIGN
314
+ 0xB5 => "\x22\x1D", # PROPORTIONAL TO
315
+ 0xB6 => "\x22\x02", # PARTIAL DIFFERENTIAL
316
+ 0xB7 => "\x20\x22", # BULLET
317
+ 0xB8 => "\x00\xF7", # DIVISION SIGN
318
+ 0xB9 => "\x22\x60", # NOT EQUAL TO
319
+ 0xBA => "\x22\x61", # IDENTICAL TO
320
+ 0xBB => "\x22\x48", # ALMOST EQUAL TO
321
+ 0xBC => "\x20\x26", # HORIZONTAL ELLIPSIS
322
+ 0xBD => "\x23\xD0", # VERTICAL LINE EXTENSION (for arrows) # for Unicode 4.0 and later
323
+ 0xBE => "\x23\xAF", # HORIZONTAL LINE EXTENSION (for arrows) # for Unicode 3.2 and later
324
+ 0xBF => "\x21\xB5", # DOWNWARDS ARROW WITH CORNER LEFTWARDS
325
+ 0xC0 => "\x21\x35", # ALEF SYMBOL
326
+ 0xC1 => "\x21\x11", # BLACK-LETTER CAPITAL I
327
+ 0xC2 => "\x21\x1C", # BLACK-LETTER CAPITAL R
328
+ 0xC3 => "\x21\x18", # SCRIPT CAPITAL P
329
+ 0xC4 => "\x22\x97", # CIRCLED TIMES
330
+ 0xC5 => "\x22\x95", # CIRCLED PLUS
331
+ 0xC6 => "\x22\x05", # EMPTY SET
332
+ 0xC7 => "\x22\x29", # INTERSECTION
333
+ 0xC8 => "\x22\x2A", # UNION
334
+ 0xC9 => "\x22\x83", # SUPERSET OF
335
+ 0xCA => "\x22\x87", # SUPERSET OF OR EQUAL TO
336
+ 0xCB => "\x22\x84", # NOT A SUBSET OF
337
+ 0xCC => "\x22\x82", # SUBSET OF
338
+ 0xCD => "\x22\x86", # SUBSET OF OR EQUAL TO
339
+ 0xCE => "\x22\x08", # ELEMENT OF
340
+ 0xCF => "\x22\x09", # NOT AN ELEMENT OF
341
+ 0xD0 => "\x22\x20", # ANGLE
342
+ 0xD1 => "\x22\x07", # NABLA
343
+ 0xD2 => "\x00\xAE", # REGISTERED SIGN # serif
344
+ 0xD3 => "\x00\xA9", # COPYRIGHT SIGN # serif
345
+ 0xD4 => "\x21\x22", # TRADE MARK SIGN # serif
346
+ 0xD5 => "\x22\x0F", # N-ARY PRODUCT
347
+ 0xD6 => "\x22\x1A", # SQUARE ROOT
348
+ 0xD7 => "\x22\xC5", # DOT OPERATOR
349
+ 0xD8 => "\x00\xAC", # NOT SIGN
350
+ 0xD9 => "\x22\x27", # LOGICAL AND
351
+ 0xDA => "\x22\x28", # LOGICAL OR
352
+ 0xDB => "\x21\xD4", # LEFT RIGHT DOUBLE ARROW
353
+ 0xDC => "\x21\xD0", # LEFTWARDS DOUBLE ARROW
354
+ 0xDD => "\x21\xD1", # UPWARDS DOUBLE ARROW
355
+ 0xDE => "\x21\xD2", # RIGHTWARDS DOUBLE ARROW
356
+ 0xDF => "\x21\xD3", # DOWNWARDS DOUBLE ARROW
357
+ 0xE0 => "\x25\xCA", # LOZENGE # previously mapped to 0x22C4 DIAMOND OPERATOR
358
+ 0xE1 => "\x30\x08", # LEFT ANGLE BRACKET
359
+ 0xE2 => "\x00\xAE", # REGISTERED SIGN, alternate: sans serif (0xF87F)
360
+ 0xE3 => "\x00\xA9", # COPYRIGHT SIGN, alternate: sans serif (0xF87F)
361
+ 0xE4 => "\x21\x22", # TRADE MARK SIGN, alternate: sans serif (0xF87F)
362
+ 0xE5 => "\x22\x11", # N-ARY SUMMATION
363
+ 0xE6 => "\x23\x9B", # LEFT PARENTHESIS UPPER HOOK # for Unicode 3.2 and later
364
+ 0xE7 => "\x23\x9C", # LEFT PARENTHESIS EXTENSION # for Unicode 3.2 and later
365
+ 0xE8 => "\x23\x9D", # LEFT PARENTHESIS LOWER HOOK # for Unicode 3.2 and later
366
+ 0xE9 => "\x23\xA1", # LEFT SQUARE BRACKET UPPER CORNER # for Unicode 3.2 and later
367
+ 0xEA => "\x23\xA2", # LEFT SQUARE BRACKET EXTENSION # for Unicode 3.2 and later
368
+ 0xEB => "\x23\xA3", # LEFT SQUARE BRACKET LOWER CORNER # for Unicode 3.2 and later
369
+ 0xEC => "\x23\xA7", # LEFT CURLY BRACKET UPPER HOOK # for Unicode 3.2 and later
370
+ 0xED => "\x23\xA8", # LEFT CURLY BRACKET MIDDLE PIECE # for Unicode 3.2 and later
371
+ 0xEE => "\x23\xA9", # LEFT CURLY BRACKET LOWER HOOK # for Unicode 3.2 and later
372
+ 0xEF => "\x23\xAA", # CURLY BRACKET EXTENSION # for Unicode 3.2 and later
373
+ 0xF0 => "\xF8\xFF", # Apple logo
374
+ 0xF1 => "\x30\x09", # RIGHT ANGLE BRACKET
375
+ 0xF2 => "\x22\x2B", # INTEGRAL
376
+ 0xF3 => "\x23\x20", # TOP HALF INTEGRAL
377
+ 0xF4 => "\x23\xAE", # INTEGRAL EXTENSION # for Unicode 3.2 and later
378
+ 0xF5 => "\x23\x21", # BOTTOM HALF INTEGRAL
379
+ 0xF6 => "\x23\x9E", # RIGHT PARENTHESIS UPPER HOOK # for Unicode 3.2 and later
380
+ 0xF7 => "\x23\x9F", # RIGHT PARENTHESIS EXTENSION # for Unicode 3.2 and later
381
+ 0xF8 => "\x23\xA0", # RIGHT PARENTHESIS LOWER HOOK # for Unicode 3.2 and later
382
+ 0xF9 => "\x23\xA4", # RIGHT SQUARE BRACKET UPPER CORNER # for Unicode 3.2 and later
383
+ 0xFA => "\x23\xA5", # RIGHT SQUARE BRACKET EXTENSION # for Unicode 3.2 and later
384
+ 0xFB => "\x23\xA6", # RIGHT SQUARE BRACKET LOWER CORNER # for Unicode 3.2 and later
385
+ 0xFC => "\x23\xAB", # RIGHT CURLY BRACKET UPPER HOOK # for Unicode 3.2 and later
386
+ 0xFD => "\x23\xAC", # RIGHT CURLY BRACKET MIDDLE PIECE # for Unicode 3.2 and later
387
+ 0xFE => "\x23\xAD", # RIGHT CURLY BRACKET LOWER HOOK # for Unicode 3.2 and later
388
+ }
389
+ SYMBOL_MAP = UNICODE_MAP.invert
390
# Returns the Symbol-font code for +name+.
#
# +name+ is looked up in NAMES first; when NAMES has no entry it is looked
# up in SYMBOL_MAP, the inverse of UNICODE_MAP. Returns nil when neither
# table has an entry.
def Symbol.byte(name)
  code = NAMES[name]
  return code if code
  SYMBOL_MAP[name]
end
393
# Translates Symbol-font encoded text byte by byte through UNICODE_MAP and
# returns the concatenated table entries. Bytes without an entry contribute
# nothing to the result.
def Symbol.to_utf16(txt)
  utf16 = ''
  txt.each_byte do |code|
    utf16 << UNICODE_MAP.fetch(code) { '' }
  end
  utf16
end
400
# Translates text made of two-byte units into Symbol-font bytes.
#
# +txt+ is consumed two bytes at a time; each pair is looked up in
# SYMBOL_MAP and the resulting code is appended as one byte. Pairs without
# an entry are dropped.
def Symbol.from_utf16(txt)
  res = ''
  # /m makes "." match newline as well: without it a 0x0A byte is skipped
  # and every following pair is read from the wrong position.
  txt.scan(/../mn) { |bb|
    byte = SYMBOL_MAP[bb]
    # Integer#chr yields a single byte; appending the Integer itself would
    # add it as a code point, i.e. as two UTF-8 bytes for values >= 0x80.
    res << byte.chr if byte
  }
  res
end
407
+ end
408
+ end