asciidammit 0.0.1 → 0.0.2
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- data/Gemfile +2 -1
- data/asciidammit.gemspec +1 -0
- data/lib/asciidammit/version.rb +1 -1
- data/lib/asciidammit.rb +111 -36
- data/lib/charmap.msgpack +1 -0
- metadata +20 -4
- data/lib/charmap.yaml +0 -2026
data/Gemfile
CHANGED
data/asciidammit.gemspec
CHANGED
data/lib/asciidammit/version.rb
CHANGED
data/lib/asciidammit.rb
CHANGED
@@ -1,45 +1,110 @@
|
|
1
1
|
require 'open-uri'
|
2
|
-
require 'yaml'
|
2
|
+
require 'csv'
|
3
|
+
require 'msgpack'
|
3
4
|
|
4
5
|
module AsciiDammit
|
5
|
-
DATABASEFILE = File.join(File.dirname(__FILE__), 'charmap.yaml')
|
6
|
+
DATABASEFILE = File.join(File.dirname(__FILE__), 'charmap.msgpack')
|
7
|
+
MANUAL = {
|
8
|
+
'DIVISION SIGN' => '/',
|
9
|
+
'COPYRIGHT SIGN' => '(c)',
|
10
|
+
'SOFT HYPHEN' => '-',
|
11
|
+
'REGISTERED SIGN' => '(r)',
|
12
|
+
'DEGREE SIGN' => 'o',
|
13
|
+
'PLUS-MINUS SIGN' => '+/-',
|
14
|
+
'MULTIPLICATION SIGN' => '*',
|
15
|
+
'INVERTED EXCLAMATION MARK' => '!',
|
16
|
+
'ACUTE ACCENT' => "'",
|
17
|
+
'HYPHEN' => '-',
|
18
|
+
'NON-BREAKING HYPHEN' => '-',
|
19
|
+
'EN DASH' => '-',
|
20
|
+
'EM DASH' => '-',
|
21
|
+
'HORIZONTAL BAR' => '-',
|
22
|
+
'HORIZONTAL ELLIPSIS' => '...',
|
23
|
+
}
|
24
|
+
|
6
25
|
def update
|
7
|
-
database =
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
:
|
14
|
-
:
|
15
|
-
:
|
16
|
-
:
|
17
|
-
:
|
26
|
+
database = {}
|
27
|
+
unmatched = []
|
28
|
+
CSV.parse(open('http://www.unicode.org/Public/3.2-Update/UnicodeData-3.2.0.txt').read, :col_sep => ';').each{|row|
|
29
|
+
next if row[1] == '<control>'
|
30
|
+
|
31
|
+
codepoint = {
|
32
|
+
:codepoint => row[0],
|
33
|
+
:char => row[0].length == 4 && row[0] >= '0020' && row[0] <= '007E' ? row[0].to_i(16).chr : nil,
|
34
|
+
:name => row[1],
|
35
|
+
:category => row[2],
|
36
|
+
:decomp => row[5] ? row[5].split.select{|v| v =~ /^[0-9A-F]{4}$/i}.collect{|cp| cp.intern} : nil,
|
37
|
+
:decimal => row[6] ? Integer(row[6]) : (row[7] ? Integer(row[7]) : nil),
|
38
|
+
:numeric => row[8] =~ /^[0-9]+$/ ? Integer(row[8]) : nil,
|
39
|
+
:upcase => row[12] ? row[12].intern : nil,
|
40
|
+
:downcase => row[13] ? row[13].intern : nil,
|
18
41
|
}
|
19
|
-
data[:decomp] = nil if data[:decomp].size == 0
|
20
42
|
|
21
|
-
|
43
|
+
# sorry guys, but I really don't know what to make of these
|
44
|
+
next if %w{KANNADA MALAYALAM TELUGU TAMIL ORIYA GUJARATI GURMUKHI BENGALI DEVANAGARI THAANA
|
45
|
+
SYRIAC HEBREW COPTIC HANGUL CYRILLIC ARMENIAN ARABIC SINHALA THAI CHEROKEE
|
46
|
+
OGHAM RUNIC TAGALOG HANUNOO BUHID TAGBANWA KHMER MONGOLIAN MEASURED KANGXI
|
47
|
+
LAO MYANMAR GEORGIAN ETHIOPIC TIBETAN}.include?(codepoint[:name].split[0])
|
48
|
+
next if codepoint[:name] =~ /CANADIAN SYLLABICS /
|
49
|
+
next if codepoint[:name] =~ /PHILIPPINE (SINGLE|DOUBLE) PUNCTUATION/
|
50
|
+
next if codepoint[:name] =~ /BOX DRAWINGS /
|
51
|
+
next if codepoint[:name] =~ /MUSICAL SYMBOL /
|
52
|
+
next if codepoint[:name] =~ /CJK COMPATIBILITY /
|
53
|
+
|
54
|
+
if codepoint[:char].nil?
|
55
|
+
name = codepoint[:name].dup.split.reject{|w| %w{TURNED INVERTED CLOSED REVERSED OPEN}.include?(w)}.join(' ')
|
56
|
+
|
57
|
+
codepoint[:char] ||= $2 if name =~ /LATIN CAPITAL (LIGATURE|LETTER) ([A-Z]+)/ && codepoint[:category] == 'Lu'
|
58
|
+
codepoint[:char] ||= $1 if name =~ /LATIN LETTER ([A-Z]+)/ && codepoint[:category] == 'Lu'
|
59
|
+
codepoint[:char] ||= $3.downcase if name =~ /LATIN( LETTER)? SMALL (LIGATURE|LETTER|CAPITAL|SCRIPT) ([A-Z]+)/ && codepoint[:category] == 'Ll'
|
60
|
+
codepoint[:char] ||= $1.downcase if name =~ /LATIN LETTER STRETCHED ([A-Z]+)/ && codepoint[:category] == 'Ll'
|
61
|
+
codepoint[:char] ||= ' ' if name =~ /NO-BREAK SPACE/ && %w{Cf Zs}.include?(codepoint[:category])
|
62
|
+
codepoint[:char] ||= $2.downcase if name =~ /MODIFIER LETTER SMALL( CAPITAL)? ([A-Z])$/ && codepoint[:category] == 'Lm'
|
63
|
+
codepoint[:char] ||= $2.downcase if name =~ /MODIFIER LETTER SMALL( CAPITAL)? ([A-Z]) / && codepoint[:category] == 'Lm'
|
64
|
+
end
|
65
|
+
|
66
|
+
codepoint[:char] ||= MANUAL[codepoint[:name]]
|
67
|
+
|
68
|
+
database[codepoint[:codepoint].intern] = database[codepoint[:name]] = codepoint
|
22
69
|
}
|
23
70
|
|
24
71
|
database.each_pair{|k, v|
|
25
|
-
next unless k
|
72
|
+
next unless k.is_a?(Symbol) && !v[:char]
|
26
73
|
|
27
74
|
case v[:category]
|
28
75
|
when 'Mn'
|
29
|
-
v[:char] = database[v[:name].gsub(/^COMBINING /, '')][:char]
|
76
|
+
v[:char] = database[v[:name].gsub(/^COMBINING /, '')][:char] if database[v[:name].gsub(/^COMBINING /, '')]
|
30
77
|
|
31
78
|
when 'Lu', 'Ll'
|
32
|
-
if v[:
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
79
|
+
if v[:decomp]
|
80
|
+
v[:char] = v[:decomp].collect{|c| database[c]}.compact.collect{|c| c[:char]}.join('')
|
81
|
+
v[:char] = nil if v[:char] == ''
|
82
|
+
else
|
83
|
+
names = v[:name].split.reject{|w| %w{LATIN MACRON SHARP WITH STROKE HOOK TOPBAR DOT ABOVE ACUTE}.include?(w)}
|
84
|
+
names.size.downto(1){|n|
|
85
|
+
names.permutation(n).to_a.each{|name|
|
86
|
+
name = "LATIN " + name.join(' ')
|
87
|
+
v[:char] = database[name][:char] if database[name] && database[name][:char]
|
88
|
+
break if v[:char]
|
89
|
+
}
|
90
|
+
break if v[:char]
|
91
|
+
}
|
39
92
|
end
|
40
93
|
|
41
|
-
when 'Lo', 'So', 'Po', 'Sc', 'Mc', 'Sm', 'Ps', 'Pe', 'Pd', 'Cs'
|
42
|
-
|
94
|
+
when 'Lm', 'Lo', 'So', 'Po', 'Sc', 'Mc', 'Sm', 'Ps', 'Pe', 'Pd', 'Cs'
|
95
|
+
names = v[:name].split.reject{|w| %w{PARENTHESIZED CIRCLED ELEMENT VERTICAL HORIZONTAL STROKE LEFTWARDS ARROW BAR OVER
|
96
|
+
RIGHTWARDS WITH DOT ABOVE BELOW TAG INVERTED LEFT-POINTING RIGHT-POINTING
|
97
|
+
QUADRANT UPPER LEFT LOWER RIGHT
|
98
|
+
ANGLE SUPERSCRIPT SUBSCRIPT DOWNWARDS UPWARDS}.include?(w)}
|
99
|
+
STDOUT.flush
|
100
|
+
names.size.downto(1){|n|
|
101
|
+
names.permutation(n).to_a.each{|name|
|
102
|
+
name = name.join(' ')
|
103
|
+
v[:char] = database[name][:char] if database[name] && database[name][:char]
|
104
|
+
break if v[:char]
|
105
|
+
}
|
106
|
+
break if v[:char]
|
107
|
+
}
|
43
108
|
|
44
109
|
when 'Lt'
|
45
110
|
m = v[:name].match(/(CAPITAL|SMALL) LETTER (.) WITH (CAPITAL|SMALL) LETTER (.)/)
|
@@ -50,7 +115,7 @@ module AsciiDammit
|
|
50
115
|
|
51
116
|
when 'Nl', 'No', 'Nd'
|
52
117
|
if v[:decomp]
|
53
|
-
v[:char] = v[:decomp].collect{|c| database[c][:char]
|
118
|
+
v[:char] = v[:decomp].collect{|c| database[c]}.compact.collect{|c| c[:char]}.join('')
|
54
119
|
v[:char] = nil if v[:char] == ''
|
55
120
|
end
|
56
121
|
if v[:char].nil?
|
@@ -65,28 +130,38 @@ module AsciiDammit
|
|
65
130
|
when 'Zs'
|
66
131
|
v[:char] = ' '
|
67
132
|
|
68
|
-
when 'Sk', 'Cf', '
|
69
|
-
|
133
|
+
when 'Sk', 'Cf', 'Me', 'Pc', 'Cc', 'Co', 'Zl', 'Zp'
|
134
|
+
v[:ignore] = true
|
70
135
|
|
71
136
|
else
|
72
137
|
raise "Unhandled character category #{v[:category]}"
|
73
138
|
end
|
139
|
+
|
140
|
+
if v[:char].nil?
|
141
|
+
v[:char] = database[v[:upcase]][:char].downcase if v[:upcase] && database[v[:upcase]][:char]
|
142
|
+
v[:char] = database[v[:downcase]][:char].upcase if v[:downcase] && database[v[:downcase]][:char]
|
143
|
+
end
|
144
|
+
|
145
|
+
#puts "#{v[:name]} (#{v[:category]})" unless v[:char] || v[:ignore]
|
146
|
+
|
147
|
+
unmatched << v unless v[:char]
|
74
148
|
}
|
75
|
-
database.
|
76
|
-
next unless k
|
77
|
-
database[k
|
149
|
+
database.keys.each{|k|
|
150
|
+
next unless k.is_a?(Symbol)
|
151
|
+
database[eval("\"\\u{#{k}}\"")] = database[k][:char]
|
78
152
|
}
|
79
|
-
database =
|
80
|
-
|
153
|
+
database[:unmatched] = unmatched
|
154
|
+
puts "#{database[:unmatched].size} unmatched"
|
155
|
+
File.open(DATABASEFILE, 'wb'){|f| f.write(database.reject{|k, v| v.is_a?(Hash)}.to_msgpack) }
|
81
156
|
end
|
82
157
|
module_function :update
|
83
158
|
|
84
159
|
update unless File.exists?(DATABASEFILE)
|
85
|
-
DATABASE =
|
160
|
+
DATABASE = File.open(DATABASEFILE, 'rb'){|f| MessagePack.unpack(f.read)}
|
86
161
|
end
|
87
162
|
|
88
163
|
class String
|
89
164
|
def asciidammit
|
90
|
-
self.split(//).collect{|c| AsciiDammit::DATABASE[c
|
165
|
+
self.split(//).collect{|c| a = AsciiDammit::DATABASE[c]; raise "Unhandled char #{c.inspect} (#{a.inspect})" unless a; a.to_s}.join('')
|
91
166
|
end
|
92
167
|
end
|