rpdf2txt 0.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. data/History.txt +5 -0
  2. data/LICENCE +515 -0
  3. data/Manifest.txt +126 -0
  4. data/README.txt +30 -0
  5. data/Rakefile +24 -0
  6. data/bin/rpdf2txt +58 -0
  7. data/config.save +12 -0
  8. data/install.rb +1098 -0
  9. data/lib/rpdf2txt-rockit/base_extensions.rb +73 -0
  10. data/lib/rpdf2txt-rockit/bootstrap.rb +120 -0
  11. data/lib/rpdf2txt-rockit/bounded_lru_cache.rb +43 -0
  12. data/lib/rpdf2txt-rockit/conflict_resolution.rb +302 -0
  13. data/lib/rpdf2txt-rockit/directed_graph.rb +401 -0
  14. data/lib/rpdf2txt-rockit/glr_parser.rb +393 -0
  15. data/lib/rpdf2txt-rockit/grammar.rb +644 -0
  16. data/lib/rpdf2txt-rockit/graphdrawing.rb +107 -0
  17. data/lib/rpdf2txt-rockit/graphviz_dot.rb +63 -0
  18. data/lib/rpdf2txt-rockit/indexable.rb +53 -0
  19. data/lib/rpdf2txt-rockit/lalr_parsetable_generator.rb +144 -0
  20. data/lib/rpdf2txt-rockit/parse_table.rb +273 -0
  21. data/lib/rpdf2txt-rockit/parsetable_generation.rb +164 -0
  22. data/lib/rpdf2txt-rockit/parsing_ambiguities.rb +84 -0
  23. data/lib/rpdf2txt-rockit/profiler.rb +168 -0
  24. data/lib/rpdf2txt-rockit/reduce_actions_generator.rb +523 -0
  25. data/lib/rpdf2txt-rockit/rockit.rb +76 -0
  26. data/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb +187 -0
  27. data/lib/rpdf2txt-rockit/rockit_grammars_parser.rb +126 -0
  28. data/lib/rpdf2txt-rockit/sourcecode_dumpable.rb +181 -0
  29. data/lib/rpdf2txt-rockit/stringscanner.rb +54 -0
  30. data/lib/rpdf2txt-rockit/syntax_tree.rb +452 -0
  31. data/lib/rpdf2txt-rockit/token.rb +364 -0
  32. data/lib/rpdf2txt-rockit/version.rb +3 -0
  33. data/lib/rpdf2txt/attributesparser.rb +42 -0
  34. data/lib/rpdf2txt/cmapparser.rb +65 -0
  35. data/lib/rpdf2txt/data/_cmap.grammar +11 -0
  36. data/lib/rpdf2txt/data/_cmap_range.grammar +15 -0
  37. data/lib/rpdf2txt/data/_pdfattributes.grammar +32 -0
  38. data/lib/rpdf2txt/data/cmap.grammar +11 -0
  39. data/lib/rpdf2txt/data/cmap.rb +37 -0
  40. data/lib/rpdf2txt/data/cmap_range.grammar +15 -0
  41. data/lib/rpdf2txt/data/cmap_range.rb +43 -0
  42. data/lib/rpdf2txt/data/fonts/Courier-Bold.afm +342 -0
  43. data/lib/rpdf2txt/data/fonts/Courier-BoldOblique.afm +342 -0
  44. data/lib/rpdf2txt/data/fonts/Courier-Oblique.afm +342 -0
  45. data/lib/rpdf2txt/data/fonts/Courier.afm +342 -0
  46. data/lib/rpdf2txt/data/fonts/Helvetica-Bold.afm +2827 -0
  47. data/lib/rpdf2txt/data/fonts/Helvetica-BoldOblique.afm +2827 -0
  48. data/lib/rpdf2txt/data/fonts/Helvetica-Oblique.afm +3051 -0
  49. data/lib/rpdf2txt/data/fonts/Helvetica.afm +3051 -0
  50. data/lib/rpdf2txt/data/fonts/License-Adobe.txt +65 -0
  51. data/lib/rpdf2txt/data/fonts/Symbol.afm +213 -0
  52. data/lib/rpdf2txt/data/fonts/Times-Bold.afm +2588 -0
  53. data/lib/rpdf2txt/data/fonts/Times-BoldItalic.afm +2384 -0
  54. data/lib/rpdf2txt/data/fonts/Times-Italic.afm +2667 -0
  55. data/lib/rpdf2txt/data/fonts/Times-Roman.afm +2419 -0
  56. data/lib/rpdf2txt/data/fonts/ZapfDingbats.afm +225 -0
  57. data/lib/rpdf2txt/data/pdfattributes.grammar +32 -0
  58. data/lib/rpdf2txt/data/pdfattributes.rb +71 -0
  59. data/lib/rpdf2txt/data/pdftext.grammar +102 -0
  60. data/lib/rpdf2txt/data/pdftext.rb +146 -0
  61. data/lib/rpdf2txt/default_handler.rb +352 -0
  62. data/lib/rpdf2txt/lzw.rb +69 -0
  63. data/lib/rpdf2txt/object.rb +1114 -0
  64. data/lib/rpdf2txt/parser.rb +169 -0
  65. data/lib/rpdf2txt/symbol.rb +408 -0
  66. data/lib/rpdf2txt/text.rb +182 -0
  67. data/lib/rpdf2txt/text_state.rb +434 -0
  68. data/lib/rpdf2txt/textparser.rb +42 -0
  69. data/test/data/3392_obj +0 -0
  70. data/test/data/397_decrypted +15 -0
  71. data/test/data/450_decrypted +153 -0
  72. data/test/data/450_obj +0 -0
  73. data/test/data/452_decrypted +125 -0
  74. data/test/data/454_decrypted +108 -0
  75. data/test/data/456_decrypted +106 -0
  76. data/test/data/458_decrypted +111 -0
  77. data/test/data/458_obj +0 -0
  78. data/test/data/460_decrypted +118 -0
  79. data/test/data/460_obj +0 -0
  80. data/test/data/463_decrypted +117 -0
  81. data/test/data/465_decrypted +107 -0
  82. data/test/data/465_obj +0 -0
  83. data/test/data/90_obj +0 -0
  84. data/test/data/90_obj_comp +1 -0
  85. data/test/data/decrypted +0 -0
  86. data/test/data/encrypt_obj +0 -0
  87. data/test/data/encrypt_string +0 -0
  88. data/test/data/encrypt_string_128bit +0 -0
  89. data/test/data/encrypted_object_stream.pdf +0 -0
  90. data/test/data/firststream +1 -0
  91. data/test/data/index.pdfobj +0 -0
  92. data/test/data/index_2bit.pdfobj +0 -0
  93. data/test/data/index_masked.pdfobj +0 -0
  94. data/test/data/indexed.pdfobj +0 -0
  95. data/test/data/indexed_2bit.pdfobj +0 -0
  96. data/test/data/indexed_masked.pdfobj +0 -0
  97. data/test/data/inline.png +0 -0
  98. data/test/data/logo.png +0 -0
  99. data/test/data/lzw.pdfobj +0 -0
  100. data/test/data/lzw_index.pdfobj +0 -0
  101. data/test/data/page_tree.pdf +148 -0
  102. data/test/data/pdf_20.png +0 -0
  103. data/test/data/pdf_21.png +0 -0
  104. data/test/data/pdf_22.png +0 -0
  105. data/test/data/pdf_50.png +0 -0
  106. data/test/data/png.pdfobj +0 -0
  107. data/test/data/space_bug_stream.txt +119 -0
  108. data/test/data/stream.txt +292 -0
  109. data/test/data/stream_kerning_bug.txt +13 -0
  110. data/test/data/stream_kerning_bug2.txt +6 -0
  111. data/test/data/test.pdf +0 -0
  112. data/test/data/test.txt +8 -0
  113. data/test/data/test_text.txt +42 -0
  114. data/test/data/working_obj +0 -0
  115. data/test/data/working_obj2 +0 -0
  116. data/test/mock.rb +149 -0
  117. data/test/suite.rb +30 -0
  118. data/test/test_pdf_object.rb +1802 -0
  119. data/test/test_pdf_parser.rb +1340 -0
  120. data/test/test_pdf_text.rb +789 -0
  121. data/test/test_space_bug_05_2004.rb +87 -0
  122. data/test/test_stream.rb +194 -0
  123. data/test/test_text_state.rb +315 -0
  124. data/usage-en.txt +112 -0
  125. data/user-stories/UserStories_Rpdf2Txt.txt +34 -0
  126. data/user-stories/documents/swissmedicjournal/04_2004.pdf +0 -0
  127. metadata +220 -0
@@ -0,0 +1,169 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # Rpdf2txt -- PDF to Text Parser
4
+ # Copyright (C) 2003 Andreas Schrafl, Hannes Wyss
5
+ #
6
+ # This library is free software; you can redistribute it and/or
7
+ # modify it under the terms of the GNU Lesser General Public
8
+ # License as published by the Free Software Foundation; either
9
+ # version 2.1 of the License, or (at your option) any later version.
10
+ #
11
+ # This library is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ # Lesser General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Lesser General Public
17
+ # License along with this library; if not, write to the Free Software
18
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ #
20
+ # ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
21
+ # hwyss@ywesee.com, aschrafl@ywesee.com
22
+ #
23
+ # PdfParser -- Rpdf2txt-- 14.11.2002 -- aschrafl@ywesee.com
24
+
25
+ require 'zlib'
26
+ require 'rpdf2txt/object'
27
+ require 'rpdf2txt/default_handler'
28
+ require 'md5'
29
+
30
+ module Rpdf2txt
31
+ VERSION = '0.8.2'
32
+ class Parser
33
+ attr_accessor :encrypt
# Prepares a parser for one PDF document. Only stores its arguments;
# nothing is scanned here.
#
# pdf_stream      - String with the source of the PDF document.
# target_encoding - encoding name kept in @target_encoding
#                   (defaults to 'utf8').
def initialize(pdf_stream, target_encoding='utf8')
  @src, @target_encoding = pdf_stream, target_encoding
  # Both start out empty and are filled in by later calls.
  @object_catalogue = @encrypt_id = nil
end
# Feeds the text of every node yielded by page_tree to a callback handler.
#
# callback_handler - receiver of the extracted text. It is handed to each
#                    node's #text and must respond to send_page and
#                    send_eof. Defaults to a fresh SimpleHandler.
#
# Returns whatever callback_handler.send_eof returns.
def extract_text(callback_handler = SimpleHandler.new)
  page_tree.each do |page|
    page.text(callback_handler)
    # signal the boundary once this node's text has been handed over
    callback_handler.send_page
  end
  callback_handler.send_eof
end
# Catalogue of the document's objects as produced by
# build_object_catalogue; built on first use and cached afterwards.
def object_catalogue
  return @object_catalogue if @object_catalogue
  @object_catalogue = build_object_catalogue
end
# Page tree of the document as returned by build_page_tree; built on first
# use and cached afterwards.
def page_tree
  return @page_tree if @page_tree
  @page_tree = build_page_tree
end
53
+ # helper methods
# Locates the document trailer and, when the trailer names an encryption
# object, installs the decoder on every catalogued object.
#
# The trailer is looked up in this order:
# 1. a TrailerDictionary that is already part of the object catalogue,
# 2. every "trailer ... startxref" section of the source; each section
#    found is merged into the result via TrailerDictionary#update,
# 3. as a last resort, the text between the offset announced by
#    "startxref <offset> %%EOF" and the next "endobj".
#
# Side effects: sets @trailer_dictionary and @encrypt_id. When the trailer's
# encrypt_id points at an object in the catalogue, also sets @encrypt and
# assigns it as decoder of every catalogued object.
#
# Returns the TrailerDictionary, or nil when the source contains none.
def build_trailer_dictionary
  # Go through the accessor: @object_catalogue is still nil when this
  # method is reached (e.g. via #trailer_dictionary) before the catalogue
  # has been built.
  catalogue = object_catalogue
  @trailer_dictionary = catalogue.values.find do |obj|
    obj.is_a?(TrailerDictionary)
  end
  startobj = 0
  endobj = 0
  while(endobj && (startobj = @src.index(/\btrailer/n, endobj)))
    if(endobj = @src.index(/startxref/n, startobj))
      # move to the last character of "startxref" so that the inclusive
      # slice below contains the whole keyword
      endobj += 8
      trailer_src = @src[startobj..endobj]
      section = TrailerDictionary.new(trailer_src, @target_encoding)
      if(@trailer_dictionary.nil?)
        @trailer_dictionary = section
      else
        @trailer_dictionary.update(section)
      end
    end
  end
  if @trailer_dictionary.nil? \
    && (match = /startxref\s*(\d+)\s*%%EOF/m.match(@src))
    startobj = match[1].to_i
    # A truncated file can announce an offset that is not followed by any
    # "endobj"; leave the trailer unset instead of failing on nil + 6.
    if (endobj = @src.index(/endobj/n, startobj))
      xref_src = @src[startobj...(endobj + 6)]
      @trailer_dictionary = TrailerDictionary.new(xref_src, @target_encoding)
    end
  end
  # Nothing usable found: report nil instead of calling methods on nil.
  return nil if @trailer_dictionary.nil?
  if (@encrypt_id = @trailer_dictionary.encrypt_id) \
    && (encrypt_obj = catalogue[@encrypt_id])
    @encrypt = PdfEncrypt.new(encrypt_obj.src)
    @encrypt.file_id = @trailer_dictionary.file_id
    catalogue.each_value do |entry|
      entry.decoder = @encrypt
    end
  end
  @trailer_dictionary
end
# Trailer of the document as returned by build_trailer_dictionary; looked
# up on first use and cached afterwards.
def trailer_dictionary
  return @trailer_dictionary if @trailer_dictionary
  @trailer_dictionary = build_trailer_dictionary
end
92
+ private
# Wraps the source of a single PDF object in the class matching its content.
#
# src - String with the source of one object.
#
# The patterns are tried top to bottom and the first hit wins, so their
# order matters: explicit /Type entries come first, followed by images,
# streams, font resources, arrays and plain dictionaries.
#
# Returns the new object; a source matching no pattern becomes an Unknown.
def build_object(src)
  case src
  when /\/Type\s*\/Catalog\b/n
    CatalogNode.new(src, @target_encoding)
  when /\/Type\s*\/Pages\b/n
    PageNode.new(src, @target_encoding)
  when /\/Type\s*\/Page\b/n
    PageLeaf.new(src, @target_encoding)
  when /\/Type\s*\/Font\b/n
    Font.new(src, @target_encoding)
  when /\/Type\s*\/FontDescriptor\b/n
    FontDescriptor.new(src, @target_encoding)
  when /\/Type\s*\/Encoding\b/n
    Encoding.new(src, @target_encoding)
  when /\/Type\s*\/ObjStm\b/n
    ObjStream.new(src, @target_encoding)
  when /\/Type\s*\/XRef\b/n
    TrailerDictionary.new(src, @target_encoding)
  when %r!/Subtype\s*/Image!n
    Image.new(src, @target_encoding)
  when /\bstream\b/n, %r{/ToUnicode\b}n
    Stream.new(src, @target_encoding)
  when /\/Font\s*<</mn
    Resource.new(src, @target_encoding)
  when /^(?:\d+\s+){2}obj\s*\[\s*(?:(\d+\s+){2}R\s*)*\]\s+endobj/mn
    ReferenceArray.new(src, @target_encoding)
  # Array holding nothing but digits and whitespace. Written without a
  # nested quantifier: the former "(\d+\s*)*" accepted exactly the same
  # strings but could backtrack exponentially on a long run of digits
  # inside an array that does not match.
  when /^(?:\d+\s+){2}obj\s*\[\s*(?:\d[\d\s]*)?\]\s+endobj/mn
    PdfArray.new(src, @target_encoding)
  when /obj\s*<</mn
    PdfHash.new(src, @target_encoding)
  else
    Unknown.new(src, @target_encoding)
  end
end
# Scans the source for "<id> <gen> obj ... endobj" sections and wraps each
# one via build_object.
#
# Returns a Hash mapping each object's oid to the object. When several
# sections yield the same oid, the one found last replaces the earlier ones.
def build_object_catalogue
  catalogue = {}
  # Without capture groups, scan yields each matched section as a String.
  @src.scan(/(?:\d+ ){2}obj\b.*?\bendobj\b/mn) do |obj_src|
    obj = build_object(obj_src)
    catalogue.store(obj.oid, obj)
  end
  catalogue
end
# Runs scan_object_stream over the decoded content of every ObjStream in
# the object catalogue, storing the results in that same catalogue.
#
# Returns the Array of ObjStream objects that were processed.
def rebuild_object_catalogue
  catalogue = object_catalogue
  # values is a snapshot, so the catalogue may grow while we iterate
  streams = catalogue.values.select { |entry| entry.is_a?(ObjStream) }
  streams.each do |stream|
    scan_object_stream(stream.decoded_stream, catalogue)
  end
end
# Asks the root returned by page_tree_root to build its tree from the
# object catalogue.
#
# Returns the result of the root's build_tree.
def build_page_tree
  # resolve the root first; the catalogue is fetched afterwards
  root = page_tree_root
  root.build_tree(object_catalogue)
end
# Splits the decoded content of an object stream into its objects and adds
# them to a catalogue.
#
# src       - String with the decoded stream: a run of
#             "<object id> <offset>" pairs followed by the object bodies.
# catalogue - Hash the resulting objects are stored in, keyed by their oid.
#
# Offsets are applied relative to the text that follows the pairs. Each
# body runs up to the next pair's offset, the last one to the end.
#
# Returns the catalogue. It is returned unchanged when src does not start
# with any pairs.
def scan_object_stream src, catalogue
  # /m lets the bodies span several lines; without it "." stops at the
  # first line break and everything behind it would be lost.
  match = /^(?<pairs>(\d+\s+\d+\s+?)+)(?<objects>.*)/m.match src
  # no offset table (e.g. an empty stream): nothing to add
  return catalogue if match.nil?
  pairs, objects = match[:pairs], match[:objects]
  offsets = pairs.scan(/(\d+)\s+(\d+)/).collect do |obj_id, offset|
    [obj_id.to_i, offset.to_i]
  end
  offsets.each_with_index do |(obj_id, offset), idx|
    # only the offset of the following pair is needed, as the end of this body
    _, nxt_offset = offsets[idx.next]
    obj_src = sprintf "%i 0 obj %s endobj", obj_id,
      objects[offset...(nxt_offset || src.length)]
    obj = build_object(obj_src)
    catalogue.store(obj.oid, obj)
  end
  catalogue
end
# Looks up the catalogue entry stored under the trailer's root_id.
#
# The catalogue and the trailer are fetched first; rebuild_object_catalogue
# then runs before the lookup.
#
# Returns the catalogue entry, or nil when the id is not catalogued.
def page_tree_root
  objects = object_catalogue
  root_reference = trailer_dictionary
  rebuild_object_catalogue
  objects[root_reference.root_id]
end
168
+ end
169
+ end
@@ -0,0 +1,408 @@
1
+ #!/usr/bin/env ruby
2
+ # Symbol -- Rpdf2txt -- 27.04.2007 -- hwyss@ywesee.com
3
+
4
+ module Rpdf2txt
5
+ module Symbol
6
+ NAMES = {
7
+ "Alpha" => 0101, # Α
8
+ "Beta" => 0102, # Β
9
+ "Chi" => 0103, # Χ
10
+ "Delta" => 0104, # Δ
11
+ "Epsilon" => 0105, # Ε
12
+ "Eta" => 0110, # Η
13
+ "Euro" => 0240, # €
14
+ "Gamma" => 0107, # Γ
15
+ "Ifraktur" => 0301, # ℑ
16
+ "Iota" => 0111, # Ι
17
+ "Kappa" => 0113, # Κ
18
+ "Lambda" => 0114, # Λ
19
+ "Mu" => 0115, # Μ
20
+ "Nu" => 0116, # Ν
21
+ "Omega" => 0127, # Ω
22
+ "Omicron" => 0117, # Ο
23
+ "Phi" => 0106, # Φ
24
+ "Pi" => 0120, # Π
25
+ "Psi" => 0131, # Ψ
26
+ "Rfraktur" => 0302, # ℜ
27
+ "Rho" => 0122, # Ρ
28
+ "Sigma" => 0123, # Σ
29
+ "Tau" => 0124, # Τ
30
+ "Theta" => 0121, # Θ
31
+ "Upsilon" => 0125, # Υ
32
+ "Upsilon1" => 0241, # ϒ
33
+ "Xi" => 0130, # Ξ
34
+ "Zeta" => 0132, # Ζ
35
+ "aleph" => 0300, # ℵ
36
+ "alpha" => 0141, # α
37
+ "ampersand" => 0046, # &
38
+ "angle" => 0320, # ∠
39
+ "angleleft" => 0341, # 〈
40
+ "angleright" => 0361, # 〉
41
+ "approxequal" => 0273, # ≈
42
+ "arrowboth" => 0253, # ↔
43
+ "arrowdblboth" => 0333, # ⇔
44
+ "arrowdbldown" => 0337, # ⇓
45
+ "arrowdblleft" => 0334, # ⇐
46
+ "arrowdblright" => 0336, # ⇒
47
+ "arrowdblup" => 0335, # ⇑
48
+ "arrowdown" => 0257, # ↓
49
+ "arrowhorizex" => 0276, # 
50
+ "arrowleft" => 0254, # ←
51
+ "arrowright" => 0256, # →
52
+ "arrowup" => 0255, # ↑
53
+ "arrowvertex" => 0275, # 
54
+ "asteriskmath" => 0052, # ∗
55
+ "bar" => 0174, # |
56
+ "beta" => 0142, # β
57
+ "braceleft" => 0173, # {
58
+ "braceright" => 0175, # }
59
+ "bracelefttp" => 0354, # 
60
+ "braceleftmid" => 0355, # 
61
+ "braceleftbt" => 0356, # 
62
+ "bracerighttp" => 0374, # 
63
+ "bracerightmid" => 0375, # 
64
+ "bracerightbt" => 0376, # 
65
+ "braceex" => 0357, # 
66
+ "bracketleft" => 0133, # [
67
+ "bracketright" => 0135, # ]
68
+ "bracketlefttp" => 0351, # 
69
+ "bracketleftex" => 0352, # 
70
+ "bracketleftbt" => 0353, # 
71
+ "bracketrighttp" => 0371, # 
72
+ "bracketrightex" => 0372, # 
73
+ "bracketrightbt" => 0373, # 
74
+ "bullet" => 0267, # •
75
+ "carriagereturn" => 0277, # ↵
76
+ "chi" => 0143, # χ
77
+ "circlemultiply" => 0304, # ⊗
78
+ "circleplus" => 0305, # ⊕
79
+ "club" => 0247, # ♣
80
+ "colon" => 0072, # :
81
+ "comma" => 0054, # ,
82
+ "congruent" => 0100, # ≅
83
+ "copyrightsans" => 0343, # 
84
+ "copyrightserif" => 0323,
85
+ "degree" => 0260, # °
86
+ "delta" => 0144, # δ
87
+ "diamond" => 0250, # ♦
88
+ "divide" => 0270, # ÷
89
+ "dotmath" => 0327, # ⋅
90
+ "eight" => 0070, # 8,
91
+ "element" => 0316, # ∈
92
+ "ellipsis" => 0274, # …
93
+ "emptyset" => 0306, # ∅
94
+ "epsilon" => 0145, # ε
95
+ "equal" => 0075, # =
96
+ "equivalence" => 0272, # ≡
97
+ "eta" => 0150, # η
98
+ "exclam" => 0041, # !
99
+ "existential" => 0044, # ∃
100
+ "five" => 0065, # 5,
101
+ "florin" => 0246, # ƒ
102
+ "four" => 0064, # 4,
103
+ "fraction" => 0244, # ⁄
104
+ "gamma" => 0147, # γ
105
+ "gradient" => 0321, # ∇
106
+ "greater" => 0076, # >
107
+ "greaterequal" => 0263, # ≥
108
+ "heart" => 0251, # ♥
109
+ "infinity" => 0245, # ∞
110
+ "integral" => 0362, # ∫
111
+ "integraltp" => 0363, # ⌠
112
+ "integralex" => 0364, # 
113
+ "integralbt" => 0365, # ⌡
114
+ "intersection" => 0307, # ∩
115
+ "iota" => 0151, # ι
116
+ "kappa" => 0153, # κ
117
+ "lambda" => 0154, # λ
118
+ "less" => 0074, # <
119
+ "lessequal" => 0243, # ≤
120
+ "logicaland" => 0331, # ∧
121
+ "logicalnot" => 0330, # ¬
122
+ "logicalor" => 0332, # ∨
123
+ "lozenge" => 0340, # ◊
124
+ "minus" => 0055, # −
125
+ "minute" => 0242, # ′
126
+ "mu" => 0155, # μ
127
+ "multiply" => 0264, # ×
128
+ "nine" => 0071, # 9,
129
+ "notelement" => 0317, # ∉
130
+ "notequal" => 0271, # ≠
131
+ "notsubset" => 0313, # ⊄
132
+ "nu" => 0156, # ν
133
+ "numbersign" => 0043, # #
134
+ "omega" => 0167, # ω
135
+ "omega1" => 0166, # ϖ
136
+ "omicron" => 0157, # ο
137
+ "one" => 0061, # 1,
138
+ "parenleft" => 0050, # (
139
+ "parenright" => 0051, # )
140
+ "parenlefttp" => 0346, # 
141
+ "parenleftex" => 0347, # 
142
+ "parenleftbt" => 0350, # 
143
+ "parenrighttp" => 0366, # 
144
+ "parenrightex" => 0367, # 
145
+ "parenrightbt" => 0370, # 
146
+ "partialdiff" => 0266, # ∂
147
+ "percent" => 0045, # %
148
+ "period" => 0056, # .
149
+ "perpendicular" => 0136, # ⊥
150
+ "phi" => 0146, # φ
151
+ "phi1" => 0152,
152
+ "pi" => 0160, # π
153
+ "plus" => 0053, # +
154
+ "plusminus" => 0261, # ±
155
+ "product" => 0325, # Π
156
+ "propersubset" => 0314, # ⊂
157
+ "propersuperset" => 0311, # ⊃
158
+ "proportional" => 0265, # ∝
159
+ "psi" => 0171, # ψ
160
+ "question" => 0077, # ?
161
+ "radical" => 0326, # √
162
+ "radicalex" => 0140,
163
+ "reflexsubset" => 0315, # ⊆
164
+ "reflexsuperset" => 0312, # ⊇
165
+ "registersans" => 0342, # 
166
+ "registerserif" => 0322,
167
+ "rho" => 0162, # ρ
168
+ "second" => 0262, # ″
169
+ "semicolon" => 0073, # ;
170
+ "seven" => 0067, # 7,
171
+ "sigma" => 0163, # σ
172
+ "sigma1" => 0126, # ς
173
+ "similar" => 0176, # ∼
174
+ "six" => 0066, # 6,
175
+ "slash" => 0057, # /
176
+ "space" => 0040,
177
+ "spade" => 0252, # ♠
178
+ "suchthat" => 0047, # ∋
179
+ "summation" => 0345, # Σ
180
+ "tau" => 0164, # τ
181
+ "therefore" => 0134, # ∴
182
+ "theta" => 0161, # θ
183
+ "theta1" => 0112,
184
+ "three" => 0063, # 3,
185
+ "trademarksans" => 0344, # 
186
+ "trademarkserif" => 0324,
187
+ "two" => 0062, # 2,
188
+ "underscore" => 0137, # _
189
+ "union" => 0310, # ∪
190
+ "universal" => 0042, # ∀
191
+ "upsilon" => 0165, # υ
192
+ "weierstrass" => 0303, # ℘
193
+ "xi" => 0170, # ξ
194
+ "zero" => 0060, # 0,
195
+ "zeta" => 0172, # ζ
196
+ }
197
+ UNICODE_MAP = { # based on http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/SYMBOL.TXT
198
+ 0x20 => "\x00\x20", # SPACE
199
+ 0x21 => "\x00\x21", # EXCLAMATION MARK
200
+ 0x22 => "\x22\x00", # FOR ALL
201
+ 0x23 => "\x00\x23", # NUMBER SIGN
202
+ 0x24 => "\x22\x03", # THERE EXISTS
203
+ 0x25 => "\x00\x25", # PERCENT SIGN
204
+ 0x26 => "\x00\x26", # AMPERSAND
205
+ 0x27 => "\x22\x0D", # SMALL CONTAINS AS MEMBER
206
+ 0x28 => "\x00\x28", # LEFT PARENTHESIS
207
+ 0x29 => "\x00\x29", # RIGHT PARENTHESIS
208
+ 0x2A => "\x22\x17", # ASTERISK OPERATOR
209
+ 0x2B => "\x00\x2B", # PLUS SIGN
210
+ 0x2C => "\x00\x2C", # COMMA
211
+ 0x2D => "\x22\x12", # MINUS SIGN
212
+ 0x2E => "\x00\x2E", # FULL STOP
213
+ 0x2F => "\x00\x2F", # SOLIDUS
214
+ 0x30 => "\x00\x30", # DIGIT ZERO
215
+ 0x31 => "\x00\x31", # DIGIT ONE
216
+ 0x32 => "\x00\x32", # DIGIT TWO
217
+ 0x33 => "\x00\x33", # DIGIT THREE
218
+ 0x34 => "\x00\x34", # DIGIT FOUR
219
+ 0x35 => "\x00\x35", # DIGIT FIVE
220
+ 0x36 => "\x00\x36", # DIGIT SIX
221
+ 0x37 => "\x00\x37", # DIGIT SEVEN
222
+ 0x38 => "\x00\x38", # DIGIT EIGHT
223
+ 0x39 => "\x00\x39", # DIGIT NINE
224
+ 0x3A => "\x00\x3A", # COLON
225
+ 0x3B => "\x00\x3B", # SEMICOLON
226
+ 0x3C => "\x00\x3C", # LESS-THAN SIGN
227
+ 0x3D => "\x00\x3D", # EQUALS SIGN
228
+ 0x3E => "\x00\x3E", # GREATER-THAN SIGN
229
+ 0x3F => "\x00\x3F", # QUESTION MARK
230
+ 0x40 => "\x22\x45", # APPROXIMATELY EQUAL TO
231
+ 0x41 => "\x03\x91", # GREEK CAPITAL LETTER ALPHA
232
+ 0x42 => "\x03\x92", # GREEK CAPITAL LETTER BETA
233
+ 0x43 => "\x03\xA7", # GREEK CAPITAL LETTER CHI
234
+ 0x44 => "\x03\x94", # GREEK CAPITAL LETTER DELTA
235
+ 0x45 => "\x03\x95", # GREEK CAPITAL LETTER EPSILON
236
+ 0x46 => "\x03\xA6", # GREEK CAPITAL LETTER PHI
237
+ 0x47 => "\x03\x93", # GREEK CAPITAL LETTER GAMMA
238
+ 0x48 => "\x03\x97", # GREEK CAPITAL LETTER ETA
239
+ 0x49 => "\x03\x99", # GREEK CAPITAL LETTER IOTA
240
+ 0x4A => "\x03\xD1", # GREEK THETA SYMBOL
241
+ 0x4B => "\x03\x9A", # GREEK CAPITAL LETTER KAPPA
242
+ 0x4C => "\x03\x9B", # GREEK CAPITAL LETTER LAMDA
243
+ 0x4D => "\x03\x9C", # GREEK CAPITAL LETTER MU
244
+ 0x4E => "\x03\x9D", # GREEK CAPITAL LETTER NU
245
+ 0x4F => "\x03\x9F", # GREEK CAPITAL LETTER OMICRON
246
+ 0x50 => "\x03\xA0", # GREEK CAPITAL LETTER PI
247
+ 0x51 => "\x03\x98", # GREEK CAPITAL LETTER THETA
248
+ 0x52 => "\x03\xA1", # GREEK CAPITAL LETTER RHO
249
+ 0x53 => "\x03\xA3", # GREEK CAPITAL LETTER SIGMA
250
+ 0x54 => "\x03\xA4", # GREEK CAPITAL LETTER TAU
251
+ 0x55 => "\x03\xA5", # GREEK CAPITAL LETTER UPSILON
252
+ 0x56 => "\x03\xC2", # GREEK SMALL LETTER FINAL SIGMA
253
+ 0x57 => "\x03\xA9", # GREEK CAPITAL LETTER OMEGA
254
+ 0x58 => "\x03\x9E", # GREEK CAPITAL LETTER XI
255
+ 0x59 => "\x03\xA8", # GREEK CAPITAL LETTER PSI
256
+ 0x5A => "\x03\x96", # GREEK CAPITAL LETTER ZETA
257
+ 0x5B => "\x00\x5B", # LEFT SQUARE BRACKET
258
+ 0x5C => "\x22\x34", # THEREFORE
259
+ 0x5D => "\x00\x5D", # RIGHT SQUARE BRACKET
260
+ 0x5E => "\x22\xA5", # UP TACK
261
+ 0x5F => "\x00\x5F", # LOW LINE
262
+ 0x60 => "\xF8\xE5", # radical extender # corporate char
263
+ 0x61 => "\x03\xB1", # GREEK SMALL LETTER ALPHA
264
+ 0x62 => "\x03\xB2", # GREEK SMALL LETTER BETA
265
+ 0x63 => "\x03\xC7", # GREEK SMALL LETTER CHI
266
+ 0x64 => "\x03\xB4", # GREEK SMALL LETTER DELTA
267
+ 0x65 => "\x03\xB5", # GREEK SMALL LETTER EPSILON
268
+ 0x66 => "\x03\xC6", # GREEK SMALL LETTER PHI
269
+ 0x67 => "\x03\xB3", # GREEK SMALL LETTER GAMMA
270
+ 0x68 => "\x03\xB7", # GREEK SMALL LETTER ETA
271
+ 0x69 => "\x03\xB9", # GREEK SMALL LETTER IOTA
272
+ 0x6A => "\x03\xD5", # GREEK PHI SYMBOL
273
+ 0x6B => "\x03\xBA", # GREEK SMALL LETTER KAPPA
274
+ 0x6C => "\x03\xBB", # GREEK SMALL LETTER LAMDA
275
+ 0x6D => "\x03\xBC", # GREEK SMALL LETTER MU
276
+ 0x6E => "\x03\xBD", # GREEK SMALL LETTER NU
277
+ 0x6F => "\x03\xBF", # GREEK SMALL LETTER OMICRON
278
+ 0x70 => "\x03\xC0", # GREEK SMALL LETTER PI
279
+ 0x71 => "\x03\xB8", # GREEK SMALL LETTER THETA
280
+ 0x72 => "\x03\xC1", # GREEK SMALL LETTER RHO
281
+ 0x73 => "\x03\xC3", # GREEK SMALL LETTER SIGMA
282
+ 0x74 => "\x03\xC4", # GREEK SMALL LETTER TAU
283
+ 0x75 => "\x03\xC5", # GREEK SMALL LETTER UPSILON
284
+ 0x76 => "\x03\xD6", # GREEK PI SYMBOL
285
+ 0x77 => "\x03\xC9", # GREEK SMALL LETTER OMEGA
286
+ 0x78 => "\x03\xBE", # GREEK SMALL LETTER XI
287
+ 0x79 => "\x03\xC8", # GREEK SMALL LETTER PSI
288
+ 0x7A => "\x03\xB6", # GREEK SMALL LETTER ZETA
289
+ 0x7B => "\x00\x7B", # LEFT CURLY BRACKET
290
+ 0x7C => "\x00\x7C", # VERTICAL LINE
291
+ 0x7D => "\x00\x7D", # RIGHT CURLY BRACKET
292
+ 0x7E => "\x22\x3C", # TILDE OPERATOR
293
+ 0xA0 => "\x20\xAC", # EURO SIGN
294
+ 0xA1 => "\x03\xD2", # GREEK UPSILON WITH HOOK SYMBOL
295
+ 0xA2 => "\x20\x32", # PRIME # minute
296
+ 0xA3 => "\x22\x64", # LESS-THAN OR EQUAL TO
297
+ 0xA4 => "\x20\x44", # FRACTION SLASH
298
+ 0xA5 => "\x22\x1E", # INFINITY
299
+ 0xA6 => "\x01\x92", # LATIN SMALL LETTER F WITH HOOK
300
+ 0xA7 => "\x26\x63", # BLACK CLUB SUIT
301
+ 0xA8 => "\x26\x66", # BLACK DIAMOND SUIT
302
+ 0xA9 => "\x26\x65", # BLACK HEART SUIT
303
+ 0xAA => "\x26\x60", # BLACK SPADE SUIT
304
+ 0xAB => "\x21\x94", # LEFT RIGHT ARROW
305
+ 0xAC => "\x21\x90", # LEFTWARDS ARROW
306
+ 0xAD => "\x21\x91", # UPWARDS ARROW
307
+ 0xAE => "\x21\x92", # RIGHTWARDS ARROW
308
+ 0xAF => "\x21\x93", # DOWNWARDS ARROW
309
+ 0xB0 => "\x00\xB0", # DEGREE SIGN
310
+ 0xB1 => "\x00\xB1", # PLUS-MINUS SIGN
311
+ 0xB2 => "\x20\x33", # DOUBLE PRIME # second
312
+ 0xB3 => "\x22\x65", # GREATER-THAN OR EQUAL TO
313
+ 0xB4 => "\x00\xD7", # MULTIPLICATION SIGN
314
+ 0xB5 => "\x22\x1D", # PROPORTIONAL TO
315
+ 0xB6 => "\x22\x02", # PARTIAL DIFFERENTIAL
316
+ 0xB7 => "\x20\x22", # BULLET
317
+ 0xB8 => "\x00\xF7", # DIVISION SIGN
318
+ 0xB9 => "\x22\x60", # NOT EQUAL TO
319
+ 0xBA => "\x22\x61", # IDENTICAL TO
320
+ 0xBB => "\x22\x48", # ALMOST EQUAL TO
321
+ 0xBC => "\x20\x26", # HORIZONTAL ELLIPSIS
322
+ 0xBD => "\x23\xD0", # VERTICAL LINE EXTENSION (for arrows) # for Unicode 4.0 and later
323
+ 0xBE => "\x23\xAF", # HORIZONTAL LINE EXTENSION (for arrows) # for Unicode 3.2 and later
324
+ 0xBF => "\x21\xB5", # DOWNWARDS ARROW WITH CORNER LEFTWARDS
325
+ 0xC0 => "\x21\x35", # ALEF SYMBOL
326
+ 0xC1 => "\x21\x11", # BLACK-LETTER CAPITAL I
327
+ 0xC2 => "\x21\x1C", # BLACK-LETTER CAPITAL R
328
+ 0xC3 => "\x21\x18", # SCRIPT CAPITAL P
329
+ 0xC4 => "\x22\x97", # CIRCLED TIMES
330
+ 0xC5 => "\x22\x95", # CIRCLED PLUS
331
+ 0xC6 => "\x22\x05", # EMPTY SET
332
+ 0xC7 => "\x22\x29", # INTERSECTION
333
+ 0xC8 => "\x22\x2A", # UNION
334
+ 0xC9 => "\x22\x83", # SUPERSET OF
335
+ 0xCA => "\x22\x87", # SUPERSET OF OR EQUAL TO
336
+ 0xCB => "\x22\x84", # NOT A SUBSET OF
337
+ 0xCC => "\x22\x82", # SUBSET OF
338
+ 0xCD => "\x22\x86", # SUBSET OF OR EQUAL TO
339
+ 0xCE => "\x22\x08", # ELEMENT OF
340
+ 0xCF => "\x22\x09", # NOT AN ELEMENT OF
341
+ 0xD0 => "\x22\x20", # ANGLE
342
+ 0xD1 => "\x22\x07", # NABLA
343
+ 0xD2 => "\x00\xAE", # REGISTERED SIGN # serif
344
+ 0xD3 => "\x00\xA9", # COPYRIGHT SIGN # serif
345
+ 0xD4 => "\x21\x22", # TRADE MARK SIGN # serif
346
+ 0xD5 => "\x22\x0F", # N-ARY PRODUCT
347
+ 0xD6 => "\x22\x1A", # SQUARE ROOT
348
+ 0xD7 => "\x22\xC5", # DOT OPERATOR
349
+ 0xD8 => "\x00\xAC", # NOT SIGN
350
+ 0xD9 => "\x22\x27", # LOGICAL AND
351
+ 0xDA => "\x22\x28", # LOGICAL OR
352
+ 0xDB => "\x21\xD4", # LEFT RIGHT DOUBLE ARROW
353
+ 0xDC => "\x21\xD0", # LEFTWARDS DOUBLE ARROW
354
+ 0xDD => "\x21\xD1", # UPWARDS DOUBLE ARROW
355
+ 0xDE => "\x21\xD2", # RIGHTWARDS DOUBLE ARROW
356
+ 0xDF => "\x21\xD3", # DOWNWARDS DOUBLE ARROW
357
+ 0xE0 => "\x25\xCA", # LOZENGE # previously mapped to 0x22C4 DIAMOND OPERATOR
358
+ 0xE1 => "\x30\x08", # LEFT ANGLE BRACKET
359
+ 0xE2 => "\x00\xAE", # REGISTERED SIGN, alternate: sans serif (0xF87F)
360
+ 0xE3 => "\x00\xA9", # COPYRIGHT SIGN, alternate: sans serif (0xF87F)
361
+ 0xE4 => "\x21\x22", # TRADE MARK SIGN, alternate: sans serif (0xF87F)
362
+ 0xE5 => "\x22\x11", # N-ARY SUMMATION
363
+ 0xE6 => "\x23\x9B", # LEFT PARENTHESIS UPPER HOOK # for Unicode 3.2 and later
364
+ 0xE7 => "\x23\x9C", # LEFT PARENTHESIS EXTENSION # for Unicode 3.2 and later
365
+ 0xE8 => "\x23\x9D", # LEFT PARENTHESIS LOWER HOOK # for Unicode 3.2 and later
366
+ 0xE9 => "\x23\xA1", # LEFT SQUARE BRACKET UPPER CORNER # for Unicode 3.2 and later
367
+ 0xEA => "\x23\xA2", # LEFT SQUARE BRACKET EXTENSION # for Unicode 3.2 and later
368
+ 0xEB => "\x23\xA3", # LEFT SQUARE BRACKET LOWER CORNER # for Unicode 3.2 and later
369
+ 0xEC => "\x23\xA7", # LEFT CURLY BRACKET UPPER HOOK # for Unicode 3.2 and later
370
+ 0xED => "\x23\xA8", # LEFT CURLY BRACKET MIDDLE PIECE # for Unicode 3.2 and later
371
+ 0xEE => "\x23\xA9", # LEFT CURLY BRACKET LOWER HOOK # for Unicode 3.2 and later
372
+ 0xEF => "\x23\xAA", # CURLY BRACKET EXTENSION # for Unicode 3.2 and later
373
+ 0xF0 => "\xF8\xFF", # Apple logo
374
+ 0xF1 => "\x30\x09", # RIGHT ANGLE BRACKET
375
+ 0xF2 => "\x22\x2B", # INTEGRAL
376
+ 0xF3 => "\x23\x20", # TOP HALF INTEGRAL
377
+ 0xF4 => "\x23\xAE", # INTEGRAL EXTENSION # for Unicode 3.2 and later
378
+ 0xF5 => "\x23\x21", # BOTTOM HALF INTEGRAL
379
+ 0xF6 => "\x23\x9E", # RIGHT PARENTHESIS UPPER HOOK # for Unicode 3.2 and later
380
+ 0xF7 => "\x23\x9F", # RIGHT PARENTHESIS EXTENSION # for Unicode 3.2 and later
381
+ 0xF8 => "\x23\xA0", # RIGHT PARENTHESIS LOWER HOOK # for Unicode 3.2 and later
382
+ 0xF9 => "\x23\xA4", # RIGHT SQUARE BRACKET UPPER CORNER # for Unicode 3.2 and later
383
+ 0xFA => "\x23\xA5", # RIGHT SQUARE BRACKET EXTENSION # for Unicode 3.2 and later
384
+ 0xFB => "\x23\xA6", # RIGHT SQUARE BRACKET LOWER CORNER # for Unicode 3.2 and later
385
+ 0xFC => "\x23\xAB", # RIGHT CURLY BRACKET UPPER HOOK # for Unicode 3.2 and later
386
+ 0xFD => "\x23\xAC", # RIGHT CURLY BRACKET MIDDLE PIECE # for Unicode 3.2 and later
387
+ 0xFE => "\x23\xAD", # RIGHT CURLY BRACKET LOWER HOOK # for Unicode 3.2 and later
388
+ }
389
+ SYMBOL_MAP = UNICODE_MAP.invert
# Looks a key up in NAMES first and in SYMBOL_MAP second.
#
# name - a glyph name from NAMES (e.g. "alpha") or a key of SYMBOL_MAP.
#
# Returns the value found, or nil when neither table has the key.
def Symbol.byte(name)
  code = NAMES[name]
  return code if code
  SYMBOL_MAP[name]
end
# Converts Symbol-font encoded text to UTF-16.
#
# txt - String that is read byte by byte.
#
# Every byte is replaced by its entry from UNICODE_MAP; bytes without an
# entry are dropped.
#
# Returns a new String with the concatenated entries.
def Symbol.to_utf16(txt)
  txt.each_byte.inject('') do |utf16, byte|
    utf16 << UNICODE_MAP.fetch(byte, '')
  end
end
# Converts UTF-16 (big-endian) text back into Symbol-font bytes.
#
# txt - String whose bytes are read as consecutive 16-bit big-endian code
#       units, whatever encoding the String is tagged with. A trailing odd
#       byte is ignored.
#
# Code units without an entry in SYMBOL_MAP are skipped.
#
# Returns a binary String holding one byte per recognised code unit.
def Symbol.from_utf16(txt)
  # Index SYMBOL_MAP by numeric code unit (built once, on first use).
  # Looking the byte pairs up as Strings is fragile: String hash keys only
  # match when their encodings are compatible, and applying a Regexp to
  # text that is invalid in its tagged encoding raises ArgumentError.
  @symbol_bytes_by_code_unit ||= SYMBOL_MAP.inject({}) do |index, (utf16, byte)|
    index[utf16.unpack('n').first] = byte
    index
  end
  # unpack works on raw bytes, so a 0x0A byte cannot shift the pairing the
  # way it did with /../, where "." does not match a line break.
  symbol_bytes = txt.unpack('n*').collect do |code_unit|
    @symbol_bytes_by_code_unit[code_unit]
  end
  # pack('C*') emits one raw byte per value. String#<< with an Integer
  # appends the character with that codepoint instead, which is two bytes
  # for values >= 0x80 in a UTF-8 String.
  symbol_bytes.compact.pack('C*')
end
407
+ end
408
+ end