asciidammit 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +2 -1
- data/asciidammit.gemspec +1 -0
- data/lib/asciidammit/version.rb +1 -1
- data/lib/asciidammit.rb +111 -36
- data/lib/charmap.msgpack +1 -0
- metadata +20 -4
- data/lib/charmap.yaml +0 -2026
data/Gemfile
CHANGED
data/asciidammit.gemspec
CHANGED
data/lib/asciidammit/version.rb
CHANGED
data/lib/asciidammit.rb
CHANGED
@@ -1,45 +1,110 @@
|
|
1
1
|
require 'open-uri'
|
2
|
-
require '
|
2
|
+
require 'csv'
|
3
|
+
require 'msgpack'
|
3
4
|
|
4
5
|
module AsciiDammit
|
5
|
-
DATABASEFILE = File.join(File.dirname(__FILE__), 'charmap.
|
6
|
+
DATABASEFILE = File.join(File.dirname(__FILE__), 'charmap.msgpack')
|
7
|
+
MANUAL = {
|
8
|
+
'DIVISION SIGN' => '/',
|
9
|
+
'COPYRIGHT SIGN' => '(c)',
|
10
|
+
'SOFT HYPHEN' => '-',
|
11
|
+
'REGISTERED SIGN' => '(r)',
|
12
|
+
'DEGREE SIGN' => 'o',
|
13
|
+
'PLUS-MINUS SIGN' => '+/-',
|
14
|
+
'MULTIPLICATION SIGN' => '*',
|
15
|
+
'INVERTED EXCLAMATION MARK' => '!',
|
16
|
+
'ACUTE ACCENT' => "'",
|
17
|
+
'HYPHEN' => '-',
|
18
|
+
'NON-BREAKING HYPHEN' => '-',
|
19
|
+
'EN DASH' => '-',
|
20
|
+
'EM DASH' => '-',
|
21
|
+
'HORIZONTAL BAR' => '-',
|
22
|
+
'HORIZONTAL ELLIPSIS' => '...',
|
23
|
+
}
|
24
|
+
|
6
25
|
def update
|
7
|
-
database =
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
:
|
14
|
-
:
|
15
|
-
:
|
16
|
-
:
|
17
|
-
:
|
26
|
+
database = {}
|
27
|
+
unmatched = []
|
28
|
+
CSV.parse(open('http://www.unicode.org/Public/3.2-Update/UnicodeData-3.2.0.txt').read, :col_sep => ';').each{|row|
|
29
|
+
next if row[1] == '<control>'
|
30
|
+
|
31
|
+
codepoint = {
|
32
|
+
:codepoint => row[0],
|
33
|
+
:char => row[0].length == 4 && row[0] >= '0020' && row[0] <= '007E' ? row[0].to_i(16).chr : nil,
|
34
|
+
:name => row[1],
|
35
|
+
:category => row[2],
|
36
|
+
:decomp => row[5] ? row[5].split.select{|v| v =~ /^[0-9A-F]{4}$/i}.collect{|cp| cp.intern} : nil,
|
37
|
+
:decimal => row[6] ? Integer(row[6]) : (row[7] ? Integer(row[7]) : nil),
|
38
|
+
:numeric => row[8] =~ /^[0-9]+$/ ? Integer(row[8]) : nil,
|
39
|
+
:upcase => row[12] ? row[12].intern : nil,
|
40
|
+
:downcase => row[13] ? row[13].intern : nil,
|
18
41
|
}
|
19
|
-
data[:decomp] = nil if data[:decomp].size == 0
|
20
42
|
|
21
|
-
|
43
|
+
# sorry guys, but I really don't know what to make of these
|
44
|
+
next if %w{KANNADA MALAYALAM TELUGU TAMIL ORIYA GUJARATI GURMUKHI BENGALI DEVANAGARI THAANA
|
45
|
+
SYRIAC HEBREW COPTIC HANGUL CYRILLIC ARMENIAN ARABIC SINHALA THAI CHEROKEE
|
46
|
+
OGHAM RUNIC TAGALOG HANUNOO BUHID TAGBANWA KHMER MONGOLIAN MEASURED KANGXI
|
47
|
+
LAO MYANMAR GEORGIAN ETHIOPIC TIBETAN}.include?(codepoint[:name].split[0])
|
48
|
+
next if codepoint[:name] =~ /CANADIAN SYLLABICS /
|
49
|
+
next if codepoint[:name] =~ /PHILIPPINE (SINGLE|DOUBLE) PUNCTUATION/
|
50
|
+
next if codepoint[:name] =~ /BOX DRAWINGS /
|
51
|
+
next if codepoint[:name] =~ /MUSICAL SYMBOL /
|
52
|
+
next if codepoint[:name] =~ /CJK COMPATIBILITY /
|
53
|
+
|
54
|
+
if codepoint[:char].nil?
|
55
|
+
name = codepoint[:name].dup.split.reject{|w| %w{TURNED INVERTED CLOSED REVERSED OPEN}.include?(w)}.join(' ')
|
56
|
+
|
57
|
+
codepoint[:char] ||= $2 if name =~ /LATIN CAPITAL (LIGATURE|LETTER) ([A-Z]+)/ && codepoint[:category] == 'Lu'
|
58
|
+
codepoint[:char] ||= $1 if name =~ /LATIN LETTER ([A-Z]+)/ && codepoint[:category] == 'Lu'
|
59
|
+
codepoint[:char] ||= $3.downcase if name =~ /LATIN( LETTER)? SMALL (LIGATURE|LETTER|CAPITAL|SCRIPT) ([A-Z]+)/ && codepoint[:category] == 'Ll'
|
60
|
+
codepoint[:char] ||= $1.downcase if name =~ /LATIN LETTER STRETCHED ([A-Z]+)/ && codepoint[:category] == 'Ll'
|
61
|
+
codepoint[:char] ||= ' ' if name =~ /NO-BREAK SPACE/ && %w{Cf Zs}.include?(codepoint[:category])
|
62
|
+
codepoint[:char] ||= $2.downcase if name =~ /MODIFIER LETTER SMALL( CAPITAL)? ([A-Z])$/ && codepoint[:category] == 'Lm'
|
63
|
+
codepoint[:char] ||= $2.downcase if name =~ /MODIFIER LETTER SMALL( CAPITAL)? ([A-Z]) / && codepoint[:category] == 'Lm'
|
64
|
+
end
|
65
|
+
|
66
|
+
codepoint[:char] ||= MANUAL[codepoint[:name]]
|
67
|
+
|
68
|
+
database[codepoint[:codepoint].intern] = database[codepoint[:name]] = codepoint
|
22
69
|
}
|
23
70
|
|
24
71
|
database.each_pair{|k, v|
|
25
|
-
next unless k
|
72
|
+
next unless k.is_a?(Symbol) && !v[:char]
|
26
73
|
|
27
74
|
case v[:category]
|
28
75
|
when 'Mn'
|
29
|
-
v[:char] = database[v[:name].gsub(/^COMBINING /, '')][:char]
|
76
|
+
v[:char] = database[v[:name].gsub(/^COMBINING /, '')][:char] if database[v[:name].gsub(/^COMBINING /, '')]
|
30
77
|
|
31
78
|
when 'Lu', 'Ll'
|
32
|
-
if v[:
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
79
|
+
if v[:decomp]
|
80
|
+
v[:char] = v[:decomp].collect{|c| database[c]}.compact.collect{|c| c[:char]}.join('')
|
81
|
+
v[:char] = nil if v[:char] == ''
|
82
|
+
else
|
83
|
+
names = v[:name].split.reject{|w| %w{LATIN MACRON SHARP WITH STROKE HOOK TOPBAR DOT ABOVE ACUTE}.include?(w)}
|
84
|
+
names.size.downto(1){|n|
|
85
|
+
names.permutation(n).to_a.each{|name|
|
86
|
+
name = "LATIN " + name.join(' ')
|
87
|
+
v[:char] = database[name][:char] if database[name] && database[name][:char]
|
88
|
+
break if v[:char]
|
89
|
+
}
|
90
|
+
break if v[:char]
|
91
|
+
}
|
39
92
|
end
|
40
93
|
|
41
|
-
when 'Lo', 'So', 'Po', 'Sc', 'Mc', 'Sm', 'Ps', 'Pe', 'Pd', 'Cs'
|
42
|
-
|
94
|
+
when 'Lm', 'Lo', 'So', 'Po', 'Sc', 'Mc', 'Sm', 'Ps', 'Pe', 'Pd', 'Cs'
|
95
|
+
names = v[:name].split.reject{|w| %w{PARENTHESIZED CIRCLED ELEMENT VERTICAL HORIZONTAL STROKE LEFTWARDS ARROW BAR OVER
|
96
|
+
RIGHTWARDS WITH DOT ABOVE BELOW TAG INVERTED LEFT-POINTING RIGHT-POINTING
|
97
|
+
QUADRANT UPPER LEFT LOWER RIGHT
|
98
|
+
ANGLE SUPERSCRIPT SUBSCRIPT DOWNWARDS UPWARDS}.include?(w)}
|
99
|
+
STDOUT.flush
|
100
|
+
names.size.downto(1){|n|
|
101
|
+
names.permutation(n).to_a.each{|name|
|
102
|
+
name = name.join(' ')
|
103
|
+
v[:char] = database[name][:char] if database[name] && database[name][:char]
|
104
|
+
break if v[:char]
|
105
|
+
}
|
106
|
+
break if v[:char]
|
107
|
+
}
|
43
108
|
|
44
109
|
when 'Lt'
|
45
110
|
m = v[:name].match(/(CAPITAL|SMALL) LETTER (.) WITH (CAPITAL|SMALL) LETTER (.)/)
|
@@ -50,7 +115,7 @@ module AsciiDammit
|
|
50
115
|
|
51
116
|
when 'Nl', 'No', 'Nd'
|
52
117
|
if v[:decomp]
|
53
|
-
v[:char] = v[:decomp].collect{|c| database[c][:char]
|
118
|
+
v[:char] = v[:decomp].collect{|c| database[c]}.compact.collect{|c| c[:char]}.join('')
|
54
119
|
v[:char] = nil if v[:char] == ''
|
55
120
|
end
|
56
121
|
if v[:char].nil?
|
@@ -65,28 +130,38 @@ module AsciiDammit
|
|
65
130
|
when 'Zs'
|
66
131
|
v[:char] = ' '
|
67
132
|
|
68
|
-
when 'Sk', 'Cf', '
|
69
|
-
|
133
|
+
when 'Sk', 'Cf', 'Me', 'Pc', 'Cc', 'Co', 'Zl', 'Zp'
|
134
|
+
v[:ignore] = true
|
70
135
|
|
71
136
|
else
|
72
137
|
raise "Unhandled character category #{v[:category]}"
|
73
138
|
end
|
139
|
+
|
140
|
+
if v[:char].nil?
|
141
|
+
v[:char] = database[v[:upcase]][:char].downcase if v[:upcase] && database[v[:upcase]][:char]
|
142
|
+
v[:char] = database[v[:downcase]][:char].upcase if v[:downcase] && database[v[:downcase]][:char]
|
143
|
+
end
|
144
|
+
|
145
|
+
#puts "#{v[:name]} (#{v[:category]})" unless v[:char] || v[:ignore]
|
146
|
+
|
147
|
+
unmatched << v unless v[:char]
|
74
148
|
}
|
75
|
-
database.
|
76
|
-
next unless k
|
77
|
-
database[k
|
149
|
+
database.keys.each{|k|
|
150
|
+
next unless k.is_a?(Symbol)
|
151
|
+
database[eval("\"\\u{#{k}}\"")] = database[k][:char]
|
78
152
|
}
|
79
|
-
database =
|
80
|
-
|
153
|
+
database[:unmatched] = unmatched
|
154
|
+
puts "#{database[:unmatched].size} unmatched"
|
155
|
+
File.open(DATABASEFILE, 'wb'){|f| f.write(database.reject{|k, v| v.is_a?(Hash)}.to_msgpack) }
|
81
156
|
end
|
82
157
|
module_function :update
|
83
158
|
|
84
159
|
update unless File.exists?(DATABASEFILE)
|
85
|
-
DATABASE =
|
160
|
+
DATABASE = File.open(DATABASEFILE, 'rb'){|f| MessagePack.unpack(f.read)}
|
86
161
|
end
|
87
162
|
|
88
163
|
class String
|
89
164
|
def asciidammit
|
90
|
-
self.split(//).collect{|c| AsciiDammit::DATABASE[c
|
165
|
+
self.split(//).collect{|c| a = AsciiDammit::DATABASE[c]; raise "Unhandled char #{c.inspect} (#{a.inspect})" unless a; a.to_s}.join('')
|
91
166
|
end
|
92
167
|
end
|