asciidammit 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
data/Gemfile CHANGED
@@ -6,5 +6,6 @@ source 'https://rubygems.org'
6
6
  # Specify your gem's dependencies in to_ascii_latex.gemspec
7
7
  gemspec
8
8
 
9
- gem'rake'
9
+ gem 'rake'
10
+ gem 'msgpack'
10
11
 
data/asciidammit.gemspec CHANGED
@@ -20,4 +20,5 @@ Gem::Specification.new do |spec|
20
20
 
21
21
  spec.add_development_dependency "bundler", "~> 1.3"
22
22
  spec.add_development_dependency "rake"
23
+ spec.add_development_dependency "msgpack"
23
24
  end
@@ -1,3 +1,3 @@
1
1
  module AsciiDammit
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
data/lib/asciidammit.rb CHANGED
@@ -1,45 +1,110 @@
1
1
  require 'open-uri'
2
- require 'yaml'
2
+ require 'csv'
3
+ require 'msgpack'
3
4
 
4
5
  module AsciiDammit
5
- DATABASEFILE = File.join(File.dirname(__FILE__), 'charmap.yaml')
6
+ DATABASEFILE = File.join(File.dirname(__FILE__), 'charmap.msgpack')
7
+ MANUAL = {
8
+ 'DIVISION SIGN' => '/',
9
+ 'COPYRIGHT SIGN' => '(c)',
10
+ 'SOFT HYPHEN' => '-',
11
+ 'REGISTERED SIGN' => '(r)',
12
+ 'DEGREE SIGN' => 'o',
13
+ 'PLUS-MINUS SIGN' => '+/-',
14
+ 'MULTIPLICATION SIGN' => '*',
15
+ 'INVERTED EXCLAMATION MARK' => '!',
16
+ 'ACUTE ACCENT' => "'",
17
+ 'HYPHEN' => '-',
18
+ 'NON-BREAKING HYPHEN' => '-',
19
+ 'EN DASH' => '-',
20
+ 'EM DASH' => '-',
21
+ 'HORIZONTAL BAR' => '-',
22
+ 'HORIZONTAL ELLIPSIS' => '...',
23
+ }
24
+
6
25
  def update
7
- database = Hash.new({})
8
- open('http://www.unicode.org/Public/3.2-Update/UnicodeData-3.2.0.txt').readlines.each{|line|
9
- data = line.strip.split(';')
10
- data = {
11
- :key => data[0],
12
- :char => data[0].length == 4 && data[0] >= '0020' && data[0] <= '007E' ? data[0].to_i(16).chr : nil,
13
- :name => data[1],
14
- :category => data[2],
15
- :decomp => data[5].split.select{|v| v =~ /^[0-9A-Z]+$/},
16
- :decimal => data[6].length > 0 ? Integer(data[6]) : nil,
17
- :numeric => data[8] =~ /^[0-9]+$/ ? Integer(data[8]) : nil
26
+ database = {}
27
+ unmatched = []
28
+ CSV.parse(open('http://www.unicode.org/Public/3.2-Update/UnicodeData-3.2.0.txt').read, :col_sep => ';').each{|row|
29
+ next if row[1] == '<control>'
30
+
31
+ codepoint = {
32
+ :codepoint => row[0],
33
+ :char => row[0].length == 4 && row[0] >= '0020' && row[0] <= '007E' ? row[0].to_i(16).chr : nil,
34
+ :name => row[1],
35
+ :category => row[2],
36
+ :decomp => row[5] ? row[5].split.select{|v| v =~ /^[0-9A-F]{4}$/i}.collect{|cp| cp.intern} : nil,
37
+ :decimal => row[6] ? Integer(row[6]) : (row[7] ? Integer(row[7]) : nil),
38
+ :numeric => row[8] =~ /^[0-9]+$/ ? Integer(row[8]) : nil,
39
+ :upcase => row[12] ? row[12].intern : nil,
40
+ :downcase => row[13] ? row[13].intern : nil,
18
41
  }
19
- data[:decomp] = nil if data[:decomp].size == 0
20
42
 
21
- database[data[:key]] = database[data[:name]] = data
43
+ # sorry guys, but I really don't know what to make of these
44
+ next if %w{KANNADA MALAYALAM TELUGU TAMIL ORIYA GUJARATI GURMUKHI BENGALI DEVANAGARI THAANA
45
+ SYRIAC HEBREW COPTIC HANGUL CYRILLIC ARMENIAN ARABIC SINHALA THAI CHEROKEE
46
+ OGHAM RUNIC TAGALOG HANUNOO BUHID TAGBANWA KHMER MONGOLIAN MEASURED KANGXI
47
+ LAO MYANMAR GEORGIAN ETHIOPIC TIBETAN}.include?(codepoint[:name].split[0])
48
+ next if codepoint[:name] =~ /CANADIAN SYLLABICS /
49
+ next if codepoint[:name] =~ /PHILIPPINE (SINGLE|DOUBLE) PUNCTUATION/
50
+ next if codepoint[:name] =~ /BOX DRAWINGS /
51
+ next if codepoint[:name] =~ /MUSICAL SYMBOL /
52
+ next if codepoint[:name] =~ /CJK COMPATIBILITY /
53
+
54
+ if codepoint[:char].nil?
55
+ name = codepoint[:name].dup.split.reject{|w| %w{TURNED INVERTED CLOSED REVERSED OPEN}.include?(w)}.join(' ')
56
+
57
+ codepoint[:char] ||= $2 if name =~ /LATIN CAPITAL (LIGATURE|LETTER) ([A-Z]+)/ && codepoint[:category] == 'Lu'
58
+ codepoint[:char] ||= $1 if name =~ /LATIN LETTER ([A-Z]+)/ && codepoint[:category] == 'Lu'
59
+ codepoint[:char] ||= $3.downcase if name =~ /LATIN( LETTER)? SMALL (LIGATURE|LETTER|CAPITAL|SCRIPT) ([A-Z]+)/ && codepoint[:category] == 'Ll'
60
+ codepoint[:char] ||= $1.downcase if name =~ /LATIN LETTER STRETCHED ([A-Z]+)/ && codepoint[:category] == 'Ll'
61
+ codepoint[:char] ||= ' ' if name =~ /NO-BREAK SPACE/ && %w{Cf Zs}.include?(codepoint[:category])
62
+ codepoint[:char] ||= $2.downcase if name =~ /MODIFIER LETTER SMALL( CAPITAL)? ([A-Z])$/ && codepoint[:category] == 'Lm'
63
+ codepoint[:char] ||= $2.downcase if name =~ /MODIFIER LETTER SMALL( CAPITAL)? ([A-Z]) / && codepoint[:category] == 'Lm'
64
+ end
65
+
66
+ codepoint[:char] ||= MANUAL[codepoint[:name]]
67
+
68
+ database[codepoint[:codepoint].intern] = database[codepoint[:name]] = codepoint
22
69
  }
23
70
 
24
71
  database.each_pair{|k, v|
25
- next unless k =~ /^[0-9A-Z]+$/
72
+ next unless k.is_a?(Symbol) && !v[:char]
26
73
 
27
74
  case v[:category]
28
75
  when 'Mn'
29
- v[:char] = database[v[:name].gsub(/^COMBINING /, '')][:char]
76
+ v[:char] = database[v[:name].gsub(/^COMBINING /, '')][:char] if database[v[:name].gsub(/^COMBINING /, '')]
30
77
 
31
78
  when 'Lu', 'Ll'
32
- if v[:char].nil?
33
- if v[:decomp]
34
- v[:char] = database[v[:decomp][0]][:char]
35
- else
36
- m = v[:name].match(/(CAPITAL|SMALL) LETTER [A-Z]+ (.)$/)
37
- v[:char] = database["LATIN #{m[1]} LETTER #{m[2]}"][:char] if m
38
- end
79
+ if v[:decomp]
80
+ v[:char] = v[:decomp].collect{|c| database[c]}.compact.collect{|c| c[:char]}.join('')
81
+ v[:char] = nil if v[:char] == ''
82
+ else
83
+ names = v[:name].split.reject{|w| %w{LATIN MACRON SHARP WITH STROKE HOOK TOPBAR DOT ABOVE ACUTE}.include?(w)}
84
+ names.size.downto(1){|n|
85
+ names.permutation(n).to_a.each{|name|
86
+ name = "LATIN " + name.join(' ')
87
+ v[:char] = database[name][:char] if database[name] && database[name][:char]
88
+ break if v[:char]
89
+ }
90
+ break if v[:char]
91
+ }
39
92
  end
40
93
 
41
- when 'Lo', 'So', 'Po', 'Sc', 'Mc', 'Sm', 'Ps', 'Pe', 'Pd', 'Cs'
42
- v[:char] = database[v[:name].gsub(/.* /, '')][:char] unless v[:char]
94
+ when 'Lm', 'Lo', 'So', 'Po', 'Sc', 'Mc', 'Sm', 'Ps', 'Pe', 'Pd', 'Cs'
95
+ names = v[:name].split.reject{|w| %w{PARENTHESIZED CIRCLED ELEMENT VERTICAL HORIZONTAL STROKE LEFTWARDS ARROW BAR OVER
96
+ RIGHTWARDS WITH DOT ABOVE BELOW TAG INVERTED LEFT-POINTING RIGHT-POINTING
97
+ QUADRANT UPPER LEFT LOWER RIGHT
98
+ ANGLE SUPERSCRIPT SUBSCRIPT DOWNWARDS UPWARDS}.include?(w)}
99
+ STDOUT.flush
100
+ names.size.downto(1){|n|
101
+ names.permutation(n).to_a.each{|name|
102
+ name = name.join(' ')
103
+ v[:char] = database[name][:char] if database[name] && database[name][:char]
104
+ break if v[:char]
105
+ }
106
+ break if v[:char]
107
+ }
43
108
 
44
109
  when 'Lt'
45
110
  m = v[:name].match(/(CAPITAL|SMALL) LETTER (.) WITH (CAPITAL|SMALL) LETTER (.)/)
@@ -50,7 +115,7 @@ module AsciiDammit
50
115
 
51
116
  when 'Nl', 'No', 'Nd'
52
117
  if v[:decomp]
53
- v[:char] = v[:decomp].collect{|c| database[c][:char].to_s}.join('')
118
+ v[:char] = v[:decomp].collect{|c| database[c]}.compact.collect{|c| c[:char]}.join('')
54
119
  v[:char] = nil if v[:char] == ''
55
120
  end
56
121
  if v[:char].nil?
@@ -65,28 +130,38 @@ module AsciiDammit
65
130
  when 'Zs'
66
131
  v[:char] = ' '
67
132
 
68
- when 'Sk', 'Cf', 'Lm', 'Me', 'Pc', 'Cc', 'Co', 'Zl', 'Zp'
69
- #
133
+ when 'Sk', 'Cf', 'Me', 'Pc', 'Cc', 'Co', 'Zl', 'Zp'
134
+ v[:ignore] = true
70
135
 
71
136
  else
72
137
  raise "Unhandled character category #{v[:category]}"
73
138
  end
139
+
140
+ if v[:char].nil?
141
+ v[:char] = database[v[:upcase]][:char].downcase if v[:upcase] && database[v[:upcase]][:char]
142
+ v[:char] = database[v[:downcase]][:char].upcase if v[:downcase] && database[v[:downcase]][:char]
143
+ end
144
+
145
+ #puts "#{v[:name]} (#{v[:category]})" unless v[:char] || v[:ignore]
146
+
147
+ unmatched << v unless v[:char]
74
148
  }
75
- database.dup.each_pair{|k, v|
76
- next unless k =~ /^[0-9A-Z]+$/ && v[:char]
77
- database[k.to_i(16)] = v[:char]
149
+ database.keys.each{|k|
150
+ next unless k.is_a?(Symbol)
151
+ database[eval("\"\\u{#{k}}\"")] = database[k][:char]
78
152
  }
79
- database = database.reject{|k, v| v.is_a?(Hash)}
80
- File.open(DATABASEFILE, "wb", :encoding => 'utf-8'){|f| f.write(database.to_yaml) }
153
+ database[:unmatched] = unmatched
154
+ puts "#{database[:unmatched].size} unmatched"
155
+ File.open(DATABASEFILE, 'wb'){|f| f.write(database.reject{|k, v| v.is_a?(Hash)}.to_msgpack) }
81
156
  end
82
157
  module_function :update
83
158
 
84
159
  update unless File.exists?(DATABASEFILE)
85
- DATABASE = YAML.load_file(DATABASEFILE)
160
+ DATABASE = File.open(DATABASEFILE, 'rb'){|f| MessagePack.unpack(f.read)}
86
161
  end
87
162
 
88
163
  class String
89
164
  def asciidammit
90
- self.split(//).collect{|c| AsciiDammit::DATABASE[c[0].ord].to_s}.join('')
165
+ self.split(//).collect{|c| a = AsciiDammit::DATABASE[c]; raise "Unhandled char #{c.inspect} (#{a.inspect})" unless a; a.to_s}.join('')
91
166
  end
92
167
  end