langa 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. data/COPYING +674 -0
  2. data/README +69 -0
  3. data/bin/langa +169 -0
  4. data/examples/afrikaans_1953_utf8.txt +1000 -0
  5. data/examples/albanian_utf8.txt +1000 -0
  6. data/examples/amharic_utf8.txt +1000 -0
  7. data/examples/arabic_svd_utf8.txt +1000 -0
  8. data/examples/armenian_western_1853_utf8.txt +1000 -0
  9. data/examples/asv_utf8.txt +1000 -0
  10. data/examples/basque_1571_utf8.txt +1000 -0
  11. data/examples/breton_utf8.txt +1000 -0
  12. data/examples/chinese_ncv_s_utf8.txt +1000 -0
  13. data/examples/chinese_ncv_utf8.txt +1000 -0
  14. data/examples/chinese_union_s_utf8.txt +1000 -0
  15. data/examples/chinese_union_utf8.txt +1000 -0
  16. data/examples/coptic_nt_utf8.txt +1000 -0
  17. data/examples/croatian_utf8.txt +1000 -0
  18. data/examples/czech_bkr_utf8.txt +1000 -0
  19. data/examples/danish_utf8.txt +1000 -0
  20. data/examples/dutch_svv_utf8.txt +1000 -0
  21. data/examples/esperanto_utf8.txt +1000 -0
  22. data/examples/estonian_utf8.txt +1000 -0
  23. data/examples/finnish_pr_1992_utf8.txt +1000 -0
  24. data/examples/french_ostervald_1996_utf8.txt +1000 -0
  25. data/examples/german_schlachter_1951_utf8.txt +1000 -0
  26. data/examples/greek_byzantine_2000_utf8.txt +1000 -0
  27. data/examples/greek_modern_utf8.txt +1000 -0
  28. data/examples/hebrew_modern_utf8.txt +1000 -0
  29. data/examples/hungarian_karoli_utf8.txt +1000 -0
  30. data/examples/italian_riveduta_1927_utf8.txt +1000 -0
  31. data/examples/kabyle_nt_utf8.txt +1000 -0
  32. data/examples/kjv_apocrypha_utf8.txt +1000 -0
  33. data/examples/korean_utf8.txt +1000 -0
  34. data/examples/latin_vulgata_clementina_utf8.txt +1000 -0
  35. data/examples/latvian_nt_utf8.txt +1000 -0
  36. data/examples/lithuanian_utf8.txt +1000 -0
  37. data/examples/manx_gaelic_utf8.txt +1000 -0
  38. data/examples/maori_utf8.txt +1000 -0
  39. data/examples/myanmar_judson_1835_utf8.txt +1000 -0
  40. data/examples/norwegian_utf8.txt +1000 -0
  41. data/examples/peshitta_utf8.txt +1000 -0
  42. data/examples/portuguese_utf8.txt +1000 -0
  43. data/examples/romani_utf8.txt +1000 -0
  44. data/examples/romanian_cornilescu_utf8.txt +1000 -0
  45. data/examples/russian_makarij_utf8.txt +1000 -0
  46. data/examples/spanish_reina_valera_1909_utf8.txt +1000 -0
  47. data/examples/swedish_1917_utf8.txt +1000 -0
  48. data/examples/tagalog_1905_utf8.txt +1000 -0
  49. data/examples/thai_kjv_utf8.txt +1000 -0
  50. data/examples/turkish_nt_utf8.txt +1000 -0
  51. data/examples/turkish_utf8.txt +1000 -0
  52. data/examples/ukrainian_1871_utf8.txt +1000 -0
  53. data/examples/vietnamese_1934_utf8.txt +1000 -0
  54. data/examples/wolof_utf8.txt +1000 -0
  55. data/examples/xhosa_utf8.txt +1000 -0
  56. data/lib/langa.rb +35 -0
  57. data/lib/langa/dna.rb +209 -0
  58. data/lib/langa/file.rb +97 -0
  59. data/lib/langa/langa.dna +406 -0
  60. data/lib/langa/languageanalyzer.rb +134 -0
  61. data/lib/langa/languages.rb +147 -0
  62. data/lib/langa/randomtestfiles.rb +140 -0
  63. data/lib/langa/utilities.rb +53 -0
  64. data/test/tc_file.rb +47 -0
  65. data/test/tc_languages.rb +69 -0
  66. data/test/tc_utilities.rb +42 -0
  67. data/unicode/CaseFolding.txt +1065 -0
  68. data/unicode/CaseFolding.txt.webloc +8 -0
  69. data/unicode/Index of -Public-MAPPINGS.webloc b/data/unicode/Index of → -Public-MAPPINGS.webloc +0 -0
  70. data/unicode/mappings/8859-1.TXT +303 -0
  71. data/unicode/mappings/8859-10.TXT +303 -0
  72. data/unicode/mappings/8859-11.TXT +297 -0
  73. data/unicode/mappings/8859-13.TXT +299 -0
  74. data/unicode/mappings/8859-14.TXT +301 -0
  75. data/unicode/mappings/8859-15.TXT +303 -0
  76. data/unicode/mappings/8859-16.TXT +299 -0
  77. data/unicode/mappings/8859-2.TXT +303 -0
  78. data/unicode/mappings/8859-3.TXT +296 -0
  79. data/unicode/mappings/8859-4.TXT +303 -0
  80. data/unicode/mappings/8859-5.TXT +303 -0
  81. data/unicode/mappings/8859-6.TXT +260 -0
  82. data/unicode/mappings/8859-7.TXT +308 -0
  83. data/unicode/mappings/8859-8.TXT +270 -0
  84. data/unicode/mappings/8859-9.TXT +307 -0
  85. data/unicode/mappings/ATARIST.TXT +313 -0
  86. data/unicode/mappings/CP037.TXT +275 -0
  87. data/unicode/mappings/CP1006.TXT +302 -0
  88. data/unicode/mappings/CP1026.TXT +275 -0
  89. data/unicode/mappings/CP1250.TXT +274 -0
  90. data/unicode/mappings/CP1251.TXT +274 -0
  91. data/unicode/mappings/CP1252.TXT +274 -0
  92. data/unicode/mappings/CP1253.TXT +274 -0
  93. data/unicode/mappings/CP1254.TXT +274 -0
  94. data/unicode/mappings/CP1255.TXT +274 -0
  95. data/unicode/mappings/CP1256.TXT +274 -0
  96. data/unicode/mappings/CP1257.TXT +274 -0
  97. data/unicode/mappings/CP1258.TXT +274 -0
  98. data/unicode/mappings/CP424.TXT +304 -0
  99. data/unicode/mappings/CP437.TXT +274 -0
  100. data/unicode/mappings/CP500.TXT +275 -0
  101. data/unicode/mappings/CP737.TXT +274 -0
  102. data/unicode/mappings/CP775.TXT +275 -0
  103. data/unicode/mappings/CP850.TXT +274 -0
  104. data/unicode/mappings/CP852.TXT +274 -0
  105. data/unicode/mappings/CP855.TXT +275 -0
  106. data/unicode/mappings/CP856.TXT +303 -0
  107. data/unicode/mappings/CP857.TXT +275 -0
  108. data/unicode/mappings/CP860.TXT +275 -0
  109. data/unicode/mappings/CP861.TXT +275 -0
  110. data/unicode/mappings/CP862.TXT +275 -0
  111. data/unicode/mappings/CP863.TXT +275 -0
  112. data/unicode/mappings/CP864.TXT +275 -0
  113. data/unicode/mappings/CP865.TXT +275 -0
  114. data/unicode/mappings/CP866.TXT +275 -0
  115. data/unicode/mappings/CP869.TXT +275 -0
  116. data/unicode/mappings/CP874.TXT +274 -0
  117. data/unicode/mappings/CP875.TXT +275 -0
  118. data/unicode/mappings/KOI8-R.TXT +302 -0
  119. data/unicode/mappings/NEXTSTEP.TXT +173 -0
  120. data/unicode/mappings/ROMAN.TXT +275 -0
  121. data/unicode/mappings/US-ASCII-QUOTES.TXT +198 -0
  122. metadata +180 -0
@@ -0,0 +1,275 @@
1
+ #
2
+ # Name: cp10000_MacRoman to Unicode table
3
+ # Unicode version: 2.0
4
+ # Table version: 2.00
5
+ # Table format: Format A
6
+ # Date: 04/24/96
7
+ # Contact: Shawn.Steele@microsoft.com
8
+ #
9
+ # General notes: none
10
+ #
11
+ # Format: Three tab-separated columns
12
+ # Column #1 is the cp10000_MacRoman code (in hex)
13
+ # Column #2 is the Unicode (in hex as 0xXXXX)
14
+ # Column #3 is the Unicode name (follows a comment sign, '#')
15
+ #
16
+ # The entries are in cp10000_MacRoman order
17
+ #
18
+ 0x00 0x0000 #NULL
19
+ 0x01 0x0001 #START OF HEADING
20
+ 0x02 0x0002 #START OF TEXT
21
+ 0x03 0x0003 #END OF TEXT
22
+ 0x04 0x0004 #END OF TRANSMISSION
23
+ 0x05 0x0005 #ENQUIRY
24
+ 0x06 0x0006 #ACKNOWLEDGE
25
+ 0x07 0x0007 #BELL
26
+ 0x08 0x0008 #BACKSPACE
27
+ 0x09 0x0009 #HORIZONTAL TABULATION
28
+ 0x0A 0x000A #LINE FEED
29
+ 0x0B 0x000B #VERTICAL TABULATION
30
+ 0x0C 0x000C #FORM FEED
31
+ 0x0D 0x000D #CARRIAGE RETURN
32
+ 0x0E 0x000E #SHIFT OUT
33
+ 0x0F 0x000F #SHIFT IN
34
+ 0x10 0x0010 #DATA LINK ESCAPE
35
+ 0x11 0x0011 #DEVICE CONTROL ONE
36
+ 0x12 0x0012 #DEVICE CONTROL TWO
37
+ 0x13 0x0013 #DEVICE CONTROL THREE
38
+ 0x14 0x0014 #DEVICE CONTROL FOUR
39
+ 0x15 0x0015 #NEGATIVE ACKNOWLEDGE
40
+ 0x16 0x0016 #SYNCHRONOUS IDLE
41
+ 0x17 0x0017 #END OF TRANSMISSION BLOCK
42
+ 0x18 0x0018 #CANCEL
43
+ 0x19 0x0019 #END OF MEDIUM
44
+ 0x1A 0x001A #SUBSTITUTE
45
+ 0x1B 0x001B #ESCAPE
46
+ 0x1C 0x001C #FILE SEPARATOR
47
+ 0x1D 0x001D #GROUP SEPARATOR
48
+ 0x1E 0x001E #RECORD SEPARATOR
49
+ 0x1F 0x001F #UNIT SEPARATOR
50
+ 0x20 0x0020 #SPACE
51
+ 0x21 0x0021 #EXCLAMATION MARK
52
+ 0x22 0x0022 #QUOTATION MARK
53
+ 0x23 0x0023 #NUMBER SIGN
54
+ 0x24 0x0024 #DOLLAR SIGN
55
+ 0x25 0x0025 #PERCENT SIGN
56
+ 0x26 0x0026 #AMPERSAND
57
+ 0x27 0x0027 #APOSTROPHE
58
+ 0x28 0x0028 #LEFT PARENTHESIS
59
+ 0x29 0x0029 #RIGHT PARENTHESIS
60
+ 0x2A 0x002A #ASTERISK
61
+ 0x2B 0x002B #PLUS SIGN
62
+ 0x2C 0x002C #COMMA
63
+ 0x2D 0x002D #HYPHEN-MINUS
64
+ 0x2E 0x002E #FULL STOP
65
+ 0x2F 0x002F #SOLIDUS
66
+ 0x30 0x0030 #DIGIT ZERO
67
+ 0x31 0x0031 #DIGIT ONE
68
+ 0x32 0x0032 #DIGIT TWO
69
+ 0x33 0x0033 #DIGIT THREE
70
+ 0x34 0x0034 #DIGIT FOUR
71
+ 0x35 0x0035 #DIGIT FIVE
72
+ 0x36 0x0036 #DIGIT SIX
73
+ 0x37 0x0037 #DIGIT SEVEN
74
+ 0x38 0x0038 #DIGIT EIGHT
75
+ 0x39 0x0039 #DIGIT NINE
76
+ 0x3A 0x003A #COLON
77
+ 0x3B 0x003B #SEMICOLON
78
+ 0x3C 0x003C #LESS-THAN SIGN
79
+ 0x3D 0x003D #EQUALS SIGN
80
+ 0x3E 0x003E #GREATER-THAN SIGN
81
+ 0x3F 0x003F #QUESTION MARK
82
+ 0x40 0x0040 #COMMERCIAL AT
83
+ 0x41 0x0041 #LATIN CAPITAL LETTER A
84
+ 0x42 0x0042 #LATIN CAPITAL LETTER B
85
+ 0x43 0x0043 #LATIN CAPITAL LETTER C
86
+ 0x44 0x0044 #LATIN CAPITAL LETTER D
87
+ 0x45 0x0045 #LATIN CAPITAL LETTER E
88
+ 0x46 0x0046 #LATIN CAPITAL LETTER F
89
+ 0x47 0x0047 #LATIN CAPITAL LETTER G
90
+ 0x48 0x0048 #LATIN CAPITAL LETTER H
91
+ 0x49 0x0049 #LATIN CAPITAL LETTER I
92
+ 0x4A 0x004A #LATIN CAPITAL LETTER J
93
+ 0x4B 0x004B #LATIN CAPITAL LETTER K
94
+ 0x4C 0x004C #LATIN CAPITAL LETTER L
95
+ 0x4D 0x004D #LATIN CAPITAL LETTER M
96
+ 0x4E 0x004E #LATIN CAPITAL LETTER N
97
+ 0x4F 0x004F #LATIN CAPITAL LETTER O
98
+ 0x50 0x0050 #LATIN CAPITAL LETTER P
99
+ 0x51 0x0051 #LATIN CAPITAL LETTER Q
100
+ 0x52 0x0052 #LATIN CAPITAL LETTER R
101
+ 0x53 0x0053 #LATIN CAPITAL LETTER S
102
+ 0x54 0x0054 #LATIN CAPITAL LETTER T
103
+ 0x55 0x0055 #LATIN CAPITAL LETTER U
104
+ 0x56 0x0056 #LATIN CAPITAL LETTER V
105
+ 0x57 0x0057 #LATIN CAPITAL LETTER W
106
+ 0x58 0x0058 #LATIN CAPITAL LETTER X
107
+ 0x59 0x0059 #LATIN CAPITAL LETTER Y
108
+ 0x5A 0x005A #LATIN CAPITAL LETTER Z
109
+ 0x5B 0x005B #LEFT SQUARE BRACKET
110
+ 0x5C 0x005C #REVERSE SOLIDUS
111
+ 0x5D 0x005D #RIGHT SQUARE BRACKET
112
+ 0x5E 0x005E #CIRCUMFLEX ACCENT
113
+ 0x5F 0x005F #LOW LINE
114
+ 0x60 0x0060 #GRAVE ACCENT
115
+ 0x61 0x0061 #LATIN SMALL LETTER A
116
+ 0x62 0x0062 #LATIN SMALL LETTER B
117
+ 0x63 0x0063 #LATIN SMALL LETTER C
118
+ 0x64 0x0064 #LATIN SMALL LETTER D
119
+ 0x65 0x0065 #LATIN SMALL LETTER E
120
+ 0x66 0x0066 #LATIN SMALL LETTER F
121
+ 0x67 0x0067 #LATIN SMALL LETTER G
122
+ 0x68 0x0068 #LATIN SMALL LETTER H
123
+ 0x69 0x0069 #LATIN SMALL LETTER I
124
+ 0x6A 0x006A #LATIN SMALL LETTER J
125
+ 0x6B 0x006B #LATIN SMALL LETTER K
126
+ 0x6C 0x006C #LATIN SMALL LETTER L
127
+ 0x6D 0x006D #LATIN SMALL LETTER M
128
+ 0x6E 0x006E #LATIN SMALL LETTER N
129
+ 0x6F 0x006F #LATIN SMALL LETTER O
130
+ 0x70 0x0070 #LATIN SMALL LETTER P
131
+ 0x71 0x0071 #LATIN SMALL LETTER Q
132
+ 0x72 0x0072 #LATIN SMALL LETTER R
133
+ 0x73 0x0073 #LATIN SMALL LETTER S
134
+ 0x74 0x0074 #LATIN SMALL LETTER T
135
+ 0x75 0x0075 #LATIN SMALL LETTER U
136
+ 0x76 0x0076 #LATIN SMALL LETTER V
137
+ 0x77 0x0077 #LATIN SMALL LETTER W
138
+ 0x78 0x0078 #LATIN SMALL LETTER X
139
+ 0x79 0x0079 #LATIN SMALL LETTER Y
140
+ 0x7A 0x007A #LATIN SMALL LETTER Z
141
+ 0x7B 0x007B #LEFT CURLY BRACKET
142
+ 0x7C 0x007C #VERTICAL LINE
143
+ 0x7D 0x007D #RIGHT CURLY BRACKET
144
+ 0x7E 0x007E #TILDE
145
+ 0x7F 0x007F #DELETE
146
+ 0x80 0x00C4 #LATIN CAPITAL LETTER A WITH DIAERESIS
147
+ 0x81 0x00C5 #LATIN CAPITAL LETTER A WITH RING ABOVE
148
+ 0x82 0x00C7 #LATIN CAPITAL LETTER C WITH CEDILLA
149
+ 0x83 0x00C9 #LATIN CAPITAL LETTER E WITH ACUTE
150
+ 0x84 0x00D1 #LATIN CAPITAL LETTER N WITH TILDE
151
+ 0x85 0x00D6 #LATIN CAPITAL LETTER O WITH DIAERESIS
152
+ 0x86 0x00DC #LATIN CAPITAL LETTER U WITH DIAERESIS
153
+ 0x87 0x00E1 #LATIN SMALL LETTER A WITH ACUTE
154
+ 0x88 0x00E0 #LATIN SMALL LETTER A WITH GRAVE
155
+ 0x89 0x00E2 #LATIN SMALL LETTER A WITH CIRCUMFLEX
156
+ 0x8A 0x00E4 #LATIN SMALL LETTER A WITH DIAERESIS
157
+ 0x8B 0x00E3 #LATIN SMALL LETTER A WITH TILDE
158
+ 0x8C 0x00E5 #LATIN SMALL LETTER A WITH RING ABOVE
159
+ 0x8D 0x00E7 #LATIN SMALL LETTER C WITH CEDILLA
160
+ 0x8E 0x00E9 #LATIN SMALL LETTER E WITH ACUTE
161
+ 0x8F 0x00E8 #LATIN SMALL LETTER E WITH GRAVE
162
+ 0x90 0x00EA #LATIN SMALL LETTER E WITH CIRCUMFLEX
163
+ 0x91 0x00EB #LATIN SMALL LETTER E WITH DIAERESIS
164
+ 0x92 0x00ED #LATIN SMALL LETTER I WITH ACUTE
165
+ 0x93 0x00EC #LATIN SMALL LETTER I WITH GRAVE
166
+ 0x94 0x00EE #LATIN SMALL LETTER I WITH CIRCUMFLEX
167
+ 0x95 0x00EF #LATIN SMALL LETTER I WITH DIAERESIS
168
+ 0x96 0x00F1 #LATIN SMALL LETTER N WITH TILDE
169
+ 0x97 0x00F3 #LATIN SMALL LETTER O WITH ACUTE
170
+ 0x98 0x00F2 #LATIN SMALL LETTER O WITH GRAVE
171
+ 0x99 0x00F4 #LATIN SMALL LETTER O WITH CIRCUMFLEX
172
+ 0x9A 0x00F6 #LATIN SMALL LETTER O WITH DIAERESIS
173
+ 0x9B 0x00F5 #LATIN SMALL LETTER O WITH TILDE
174
+ 0x9C 0x00FA #LATIN SMALL LETTER U WITH ACUTE
175
+ 0x9D 0x00F9 #LATIN SMALL LETTER U WITH GRAVE
176
+ 0x9E 0x00FB #LATIN SMALL LETTER U WITH CIRCUMFLEX
177
+ 0x9F 0x00FC #LATIN SMALL LETTER U WITH DIAERESIS
178
+ 0xA0 0x2020 #DAGGER
179
+ 0xA1 0x00B0 #DEGREE SIGN
180
+ 0xA2 0x00A2 #CENT SIGN
181
+ 0xA3 0x00A3 #POUND SIGN
182
+ 0xA4 0x00A7 #SECTION SIGN
183
+ 0xA5 0x2022 #BULLET
184
+ 0xA6 0x00B6 #PILCROW SIGN
185
+ 0xA7 0x00DF #LATIN SMALL LETTER SHARP S
186
+ 0xA8 0x00AE #REGISTERED SIGN
187
+ 0xA9 0x00A9 #COPYRIGHT SIGN
188
+ 0xAA 0x2122 #TRADE MARK SIGN
189
+ 0xAB 0x00B4 #ACUTE ACCENT
190
+ 0xAC 0x00A8 #DIAERESIS
191
+ 0xAD 0x2260 #NOT EQUAL TO
192
+ 0xAE 0x00C6 #LATIN CAPITAL LIGATURE AE
193
+ 0xAF 0x00D8 #LATIN CAPITAL LETTER O WITH STROKE
194
+ 0xB0 0x221E #INFINITY
195
+ 0xB1 0x00B1 #PLUS-MINUS SIGN
196
+ 0xB2 0x2264 #LESS-THAN OR EQUAL TO
197
+ 0xB3 0x2265 #GREATER-THAN OR EQUAL TO
198
+ 0xB4 0x00A5 #YEN SIGN
199
+ 0xB5 0x00B5 #MICRO SIGN
200
+ 0xB6 0x2202 #PARTIAL DIFFERENTIAL
201
+ 0xB7 0x2211 #N-ARY SUMMATION
202
+ 0xB8 0x220F #N-ARY PRODUCT
203
+ 0xB9 0x03C0 #GREEK SMALL LETTER PI
204
+ 0xBA 0x222B #INTEGRAL
205
+ 0xBB 0x00AA #FEMININE ORDINAL INDICATOR
206
+ 0xBC 0x00BA #MASCULINE ORDINAL INDICATOR
207
+ 0xBD 0x2126 #OHM SIGN
208
+ 0xBE 0x00E6 #LATIN SMALL LIGATURE AE
209
+ 0xBF 0x00F8 #LATIN SMALL LETTER O WITH STROKE
210
+ 0xC0 0x00BF #INVERTED QUESTION MARK
211
+ 0xC1 0x00A1 #INVERTED EXCLAMATION MARK
212
+ 0xC2 0x00AC #NOT SIGN
213
+ 0xC3 0x221A #SQUARE ROOT
214
+ 0xC4 0x0192 #LATIN SMALL LETTER F WITH HOOK
215
+ 0xC5 0x2248 #ALMOST EQUAL TO
216
+ 0xC6 0x2206 #INCREMENT
217
+ 0xC7 0x00AB #LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
218
+ 0xC8 0x00BB #RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
219
+ 0xC9 0x2026 #HORIZONTAL ELLIPSIS
220
+ 0xCA 0x00A0 #NO-BREAK SPACE
221
+ 0xCB 0x00C0 #LATIN CAPITAL LETTER A WITH GRAVE
222
+ 0xCC 0x00C3 #LATIN CAPITAL LETTER A WITH TILDE
223
+ 0xCD 0x00D5 #LATIN CAPITAL LETTER O WITH TILDE
224
+ 0xCE 0x0152 #LATIN CAPITAL LIGATURE OE
225
+ 0xCF 0x0153 #LATIN SMALL LIGATURE OE
226
+ 0xD0 0x2013 #EN DASH
227
+ 0xD1 0x2014 #EM DASH
228
+ 0xD2 0x201C #LEFT DOUBLE QUOTATION MARK
229
+ 0xD3 0x201D #RIGHT DOUBLE QUOTATION MARK
230
+ 0xD4 0x2018 #LEFT SINGLE QUOTATION MARK
231
+ 0xD5 0x2019 #RIGHT SINGLE QUOTATION MARK
232
+ 0xD6 0x00F7 #DIVISION SIGN
233
+ 0xD7 0x25CA #LOZENGE
234
+ 0xD8 0x00FF #LATIN SMALL LETTER Y WITH DIAERESIS
235
+ 0xD9 0x0178 #LATIN CAPITAL LETTER Y WITH DIAERESIS
236
+ 0xDA 0x2044 #FRACTION SLASH
237
+ 0xDB 0x00A4 #CURRENCY SIGN
238
+ 0xDC 0x2039 #SINGLE LEFT-POINTING ANGLE QUOTATION MARK
239
+ 0xDD 0x203A #SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
240
+ 0xDE 0xFB01 #LATIN SMALL LIGATURE FI
241
+ 0xDF 0xFB02 #LATIN SMALL LIGATURE FL
242
+ 0xE0 0x2021 #DOUBLE DAGGER
243
+ 0xE1 0x00B7 #MIDDLE DOT
244
+ 0xE2 0x201A #SINGLE LOW-9 QUOTATION MARK
245
+ 0xE3 0x201E #DOUBLE LOW-9 QUOTATION MARK
246
+ 0xE4 0x2030 #PER MILLE SIGN
247
+ 0xE5 0x00C2 #LATIN CAPITAL LETTER A WITH CIRCUMFLEX
248
+ 0xE6 0x00CA #LATIN CAPITAL LETTER E WITH CIRCUMFLEX
249
+ 0xE7 0x00C1 #LATIN CAPITAL LETTER A WITH ACUTE
250
+ 0xE8 0x00CB #LATIN CAPITAL LETTER E WITH DIAERESIS
251
+ 0xE9 0x00C8 #LATIN CAPITAL LETTER E WITH GRAVE
252
+ 0xEA 0x00CD #LATIN CAPITAL LETTER I WITH ACUTE
253
+ 0xEB 0x00CE #LATIN CAPITAL LETTER I WITH CIRCUMFLEX
254
+ 0xEC 0x00CF #LATIN CAPITAL LETTER I WITH DIAERESIS
255
+ 0xED 0x00CC #LATIN CAPITAL LETTER I WITH GRAVE
256
+ 0xEE 0x00D3 #LATIN CAPITAL LETTER O WITH ACUTE
257
+ 0xEF 0x00D4 #LATIN CAPITAL LETTER O WITH CIRCUMFLEX
258
+ 0xF0 #UNDEFINED
259
+ 0xF1 0x00D2 #LATIN CAPITAL LETTER O WITH GRAVE
260
+ 0xF2 0x00DA #LATIN CAPITAL LETTER U WITH ACUTE
261
+ 0xF3 0x00DB #LATIN CAPITAL LETTER U WITH CIRCUMFLEX
262
+ 0xF4 0x00D9 #LATIN CAPITAL LETTER U WITH GRAVE
263
+ 0xF5 0x0131 #LATIN SMALL LETTER DOTLESS I
264
+ 0xF6 0x02C6 #MODIFIER LETTER CIRCUMFLEX ACCENT
265
+ 0xF7 0x02DC #SMALL TILDE
266
+ 0xF8 0x00AF #MACRON
267
+ 0xF9 0x02D8 #BREVE
268
+ 0xFA 0x02D9 #DOT ABOVE
269
+ 0xFB 0x02DA #RING ABOVE
270
+ 0xFC 0x00B8 #CEDILLA
271
+ 0xFD 0x02DD #DOUBLE ACUTE ACCENT
272
+ 0xFE 0x02DB #OGONEK
273
+ 0xFF 0x02C7 #CARON
274
+
275
+ 
@@ -0,0 +1,198 @@
1
+ #
2
+ # Name: ANSI X3.4-1968 (US-ASCII) with 0x60/0x27 as
3
+ # left/right single quotation mark to Unicode
4
+ # Unicode version: 3.2
5
+ # Table version: 1.0
6
+ # Table format: Format A
7
+ # Date: 2003 April 8
8
+ # Authors: Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/>
9
+ #
10
+ # General notes:
11
+ #
12
+ # The coded character set commonly known as "American Standard
13
+ # Code for Information Interchange (ASCII)" originated in the
14
+ # early 1960s international standardization project that led to
15
+ # ECMA-6 (1965) and ISO 646 (1972). When the American National
16
+ # Standards Institute adopted this specification as national
17
+ # standard X3.4 in 1968, it added a national provision for
18
+ # overloading the code positions 0x60 and 0x27 with the
19
+ # typographic characters left and right single quotation mark.
20
+ # This usage was not reflected in the international standard and
21
+ # other national adoptions of it, but became widely used in some
22
+ # communities in the United States and is now found in numerous
23
+ # historic and still even some contemporary English-language
24
+ # 7-bit ASCII text files. The Unicode Standard followed
25
+ # explicitly the international standard, in which 0x27 encodes
26
+ # the directionally neutral (vertical) character that is used as
27
+ # both an opening and closing quotation mark as well as an
28
+ # apostrophe on traditional typewriters, and where 0x60 is a
29
+ # spacing grave accent that matches the spacing acute accent
30
+ # found in ISO 8859-1 on position 0xb4.
31
+ #
32
+ # To facilitate the correct display and conversion of such ASCII
33
+ # documents with directional quotation marks to Unicode, this
34
+ # encoding table defines a 7-bit coded character set mapping
35
+ # that differs from ISO 646-IRV in that the characters 0x60 and
36
+ # 0x27 are mapped to Unicode's typographic directional quotation
37
+ # marks on U+2018 and U+2019, respectively.
38
+ #
39
+ # Notes:
40
+ #
41
+ # - This historic ASCII interpretation is also used in the left
42
+ # half of the PostScript StandardEncoding and (erroneously)
43
+ # ISOLatin1Encoding encoding vectors.
44
+ #
45
+ # - Unicode features a SINGLE HIGH-REVERSED-9 QUOTATION MARK
46
+ # (U+201B), whose provided example glyph has a slightly
47
+ # closer resemblance to the compromise glyphs found in many
48
+ # historic US-ASCII fonts that try to represent both a left
49
+ # quotation mark and a grave accent. However, since U+201B
50
+ # is not actually intended to encode correct English
51
+ # typographic quotation conventions, this table maps 0x60 to
52
+ # the correct English opening quotation mark U+2018
53
+ # instead (as did PostScript).
54
+ #
55
+ # References:
56
+ #
57
+ # - Markus Kuhn: ASCII and Unicode quotation marks.
58
+ # http://www.cl.cam.ac.uk/~mgk25/ucs/quotes.html
59
+ #
60
+ # - Jukka Korpela: Character histories: notes on some Ascii
61
+ # code positions.
62
+ # http://www.cs.tut.fi/~jkorpela/latin1/ascii-hist.html
63
+ #
64
+ # Format: Three tab-separated columns
65
+ # Column #1 is the ANSI X3.4 code (in hex as 0xXX)
66
+ # Column #2 is the Unicode (in hex as 0xXXXX)
67
+ # Column #3 is the Unicode name (follows a comment sign, '#')
68
+ #
69
+ # The entries are in ANSI X3.4 order.
70
+ #
71
+ 0x00 0x0000 # NULL
72
+ 0x01 0x0001 # START OF HEADING
73
+ 0x02 0x0002 # START OF TEXT
74
+ 0x03 0x0003 # END OF TEXT
75
+ 0x04 0x0004 # END OF TRANSMISSION
76
+ 0x05 0x0005 # ENQUIRY
77
+ 0x06 0x0006 # ACKNOWLEDGE
78
+ 0x07 0x0007 # BELL
79
+ 0x08 0x0008 # BACKSPACE
80
+ 0x09 0x0009 # HORIZONTAL TABULATION
81
+ 0x0A 0x000A # LINE FEED
82
+ 0x0B 0x000B # VERTICAL TABULATION
83
+ 0x0C 0x000C # FORM FEED
84
+ 0x0D 0x000D # CARRIAGE RETURN
85
+ 0x0E 0x000E # SHIFT OUT
86
+ 0x0F 0x000F # SHIFT IN
87
+ 0x10 0x0010 # DATA LINK ESCAPE
88
+ 0x11 0x0011 # DEVICE CONTROL ONE
89
+ 0x12 0x0012 # DEVICE CONTROL TWO
90
+ 0x13 0x0013 # DEVICE CONTROL THREE
91
+ 0x14 0x0014 # DEVICE CONTROL FOUR
92
+ 0x15 0x0015 # NEGATIVE ACKNOWLEDGE
93
+ 0x16 0x0016 # SYNCHRONOUS IDLE
94
+ 0x17 0x0017 # END OF TRANSMISSION BLOCK
95
+ 0x18 0x0018 # CANCEL
96
+ 0x19 0x0019 # END OF MEDIUM
97
+ 0x1A 0x001A # SUBSTITUTE
98
+ 0x1B 0x001B # ESCAPE
99
+ 0x1C 0x001C # FILE SEPARATOR
100
+ 0x1D 0x001D # GROUP SEPARATOR
101
+ 0x1E 0x001E # RECORD SEPARATOR
102
+ 0x1F 0x001F # UNIT SEPARATOR
103
+ 0x20 0x0020 # SPACE
104
+ 0x21 0x0021 # EXCLAMATION MARK
105
+ 0x22 0x0022 # QUOTATION MARK
106
+ 0x23 0x0023 # NUMBER SIGN
107
+ 0x24 0x0024 # DOLLAR SIGN
108
+ 0x25 0x0025 # PERCENT SIGN
109
+ 0x26 0x0026 # AMPERSAND
110
+ 0x27 0x2019 # RIGHT SINGLE QUOTATION MARK
111
+ 0x28 0x0028 # LEFT PARENTHESIS
112
+ 0x29 0x0029 # RIGHT PARENTHESIS
113
+ 0x2A 0x002A # ASTERISK
114
+ 0x2B 0x002B # PLUS SIGN
115
+ 0x2C 0x002C # COMMA
116
+ 0x2D 0x002D # HYPHEN-MINUS
117
+ 0x2E 0x002E # FULL STOP
118
+ 0x2F 0x002F # SOLIDUS
119
+ 0x30 0x0030 # DIGIT ZERO
120
+ 0x31 0x0031 # DIGIT ONE
121
+ 0x32 0x0032 # DIGIT TWO
122
+ 0x33 0x0033 # DIGIT THREE
123
+ 0x34 0x0034 # DIGIT FOUR
124
+ 0x35 0x0035 # DIGIT FIVE
125
+ 0x36 0x0036 # DIGIT SIX
126
+ 0x37 0x0037 # DIGIT SEVEN
127
+ 0x38 0x0038 # DIGIT EIGHT
128
+ 0x39 0x0039 # DIGIT NINE
129
+ 0x3A 0x003A # COLON
130
+ 0x3B 0x003B # SEMICOLON
131
+ 0x3C 0x003C # LESS-THAN SIGN
132
+ 0x3D 0x003D # EQUALS SIGN
133
+ 0x3E 0x003E # GREATER-THAN SIGN
134
+ 0x3F 0x003F # QUESTION MARK
135
+ 0x40 0x0040 # COMMERCIAL AT
136
+ 0x41 0x0041 # LATIN CAPITAL LETTER A
137
+ 0x42 0x0042 # LATIN CAPITAL LETTER B
138
+ 0x43 0x0043 # LATIN CAPITAL LETTER C
139
+ 0x44 0x0044 # LATIN CAPITAL LETTER D
140
+ 0x45 0x0045 # LATIN CAPITAL LETTER E
141
+ 0x46 0x0046 # LATIN CAPITAL LETTER F
142
+ 0x47 0x0047 # LATIN CAPITAL LETTER G
143
+ 0x48 0x0048 # LATIN CAPITAL LETTER H
144
+ 0x49 0x0049 # LATIN CAPITAL LETTER I
145
+ 0x4A 0x004A # LATIN CAPITAL LETTER J
146
+ 0x4B 0x004B # LATIN CAPITAL LETTER K
147
+ 0x4C 0x004C # LATIN CAPITAL LETTER L
148
+ 0x4D 0x004D # LATIN CAPITAL LETTER M
149
+ 0x4E 0x004E # LATIN CAPITAL LETTER N
150
+ 0x4F 0x004F # LATIN CAPITAL LETTER O
151
+ 0x50 0x0050 # LATIN CAPITAL LETTER P
152
+ 0x51 0x0051 # LATIN CAPITAL LETTER Q
153
+ 0x52 0x0052 # LATIN CAPITAL LETTER R
154
+ 0x53 0x0053 # LATIN CAPITAL LETTER S
155
+ 0x54 0x0054 # LATIN CAPITAL LETTER T
156
+ 0x55 0x0055 # LATIN CAPITAL LETTER U
157
+ 0x56 0x0056 # LATIN CAPITAL LETTER V
158
+ 0x57 0x0057 # LATIN CAPITAL LETTER W
159
+ 0x58 0x0058 # LATIN CAPITAL LETTER X
160
+ 0x59 0x0059 # LATIN CAPITAL LETTER Y
161
+ 0x5A 0x005A # LATIN CAPITAL LETTER Z
162
+ 0x5B 0x005B # LEFT SQUARE BRACKET
163
+ 0x5C 0x005C # REVERSE SOLIDUS
164
+ 0x5D 0x005D # RIGHT SQUARE BRACKET
165
+ 0x5E 0x005E # CIRCUMFLEX ACCENT
166
+ 0x5F 0x005F # LOW LINE
167
+ 0x60 0x2018 # LEFT SINGLE QUOTATION MARK
168
+ 0x61 0x0061 # LATIN SMALL LETTER A
169
+ 0x62 0x0062 # LATIN SMALL LETTER B
170
+ 0x63 0x0063 # LATIN SMALL LETTER C
171
+ 0x64 0x0064 # LATIN SMALL LETTER D
172
+ 0x65 0x0065 # LATIN SMALL LETTER E
173
+ 0x66 0x0066 # LATIN SMALL LETTER F
174
+ 0x67 0x0067 # LATIN SMALL LETTER G
175
+ 0x68 0x0068 # LATIN SMALL LETTER H
176
+ 0x69 0x0069 # LATIN SMALL LETTER I
177
+ 0x6A 0x006A # LATIN SMALL LETTER J
178
+ 0x6B 0x006B # LATIN SMALL LETTER K
179
+ 0x6C 0x006C # LATIN SMALL LETTER L
180
+ 0x6D 0x006D # LATIN SMALL LETTER M
181
+ 0x6E 0x006E # LATIN SMALL LETTER N
182
+ 0x6F 0x006F # LATIN SMALL LETTER O
183
+ 0x70 0x0070 # LATIN SMALL LETTER P
184
+ 0x71 0x0071 # LATIN SMALL LETTER Q
185
+ 0x72 0x0072 # LATIN SMALL LETTER R
186
+ 0x73 0x0073 # LATIN SMALL LETTER S
187
+ 0x74 0x0074 # LATIN SMALL LETTER T
188
+ 0x75 0x0075 # LATIN SMALL LETTER U
189
+ 0x76 0x0076 # LATIN SMALL LETTER V
190
+ 0x77 0x0077 # LATIN SMALL LETTER W
191
+ 0x78 0x0078 # LATIN SMALL LETTER X
192
+ 0x79 0x0079 # LATIN SMALL LETTER Y
193
+ 0x7A 0x007A # LATIN SMALL LETTER Z
194
+ 0x7B 0x007B # LEFT CURLY BRACKET
195
+ 0x7C 0x007C # VERTICAL LINE
196
+ 0x7D 0x007D # RIGHT CURLY BRACKET
197
+ 0x7E 0x007E # TILDE
198
+ 0x7F 0x007F # DELETE