asciidammit 0.0.1 → 0.0.2

This diff shows the differences between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
data/Gemfile CHANGED
@@ -6,5 +6,6 @@ source 'https://rubygems.org'
6
6
  # Specify your gem's dependencies in to_ascii_latex.gemspec
7
7
  gemspec
8
8
 
9
- gem'rake'
9
+ gem 'rake'
10
+ gem 'msgpack'
10
11
 
data/asciidammit.gemspec CHANGED
@@ -20,4 +20,5 @@ Gem::Specification.new do |spec|
20
20
 
21
21
  spec.add_development_dependency "bundler", "~> 1.3"
22
22
  spec.add_development_dependency "rake"
23
+ spec.add_development_dependency "msgpack"
23
24
  end
@@ -1,3 +1,3 @@
1
1
  module AsciiDammit
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
data/lib/asciidammit.rb CHANGED
@@ -1,45 +1,110 @@
1
1
  require 'open-uri'
2
- require 'yaml'
2
+ require 'csv'
3
+ require 'msgpack'
3
4
 
4
5
  module AsciiDammit
5
- DATABASEFILE = File.join(File.dirname(__FILE__), 'charmap.yaml')
6
+ DATABASEFILE = File.join(File.dirname(__FILE__), 'charmap.msgpack')
7
+ MANUAL = {
8
+ 'DIVISION SIGN' => '/',
9
+ 'COPYRIGHT SIGN' => '(c)',
10
+ 'SOFT HYPHEN' => '-',
11
+ 'REGISTERED SIGN' => '(r)',
12
+ 'DEGREE SIGN' => 'o',
13
+ 'PLUS-MINUS SIGN' => '+/-',
14
+ 'MULTIPLICATION SIGN' => '*',
15
+ 'INVERTED EXCLAMATION MARK' => '!',
16
+ 'ACUTE ACCENT' => "'",
17
+ 'HYPHEN' => '-',
18
+ 'NON-BREAKING HYPHEN' => '-',
19
+ 'EN DASH' => '-',
20
+ 'EM DASH' => '-',
21
+ 'HORIZONTAL BAR' => '-',
22
+ 'HORIZONTAL ELLIPSIS' => '...',
23
+ }
24
+
6
25
  def update
7
- database = Hash.new({})
8
- open('http://www.unicode.org/Public/3.2-Update/UnicodeData-3.2.0.txt').readlines.each{|line|
9
- data = line.strip.split(';')
10
- data = {
11
- :key => data[0],
12
- :char => data[0].length == 4 && data[0] >= '0020' && data[0] <= '007E' ? data[0].to_i(16).chr : nil,
13
- :name => data[1],
14
- :category => data[2],
15
- :decomp => data[5].split.select{|v| v =~ /^[0-9A-Z]+$/},
16
- :decimal => data[6].length > 0 ? Integer(data[6]) : nil,
17
- :numeric => data[8] =~ /^[0-9]+$/ ? Integer(data[8]) : nil
26
+ database = {}
27
+ unmatched = []
28
+ CSV.parse(open('http://www.unicode.org/Public/3.2-Update/UnicodeData-3.2.0.txt').read, :col_sep => ';').each{|row|
29
+ next if row[1] == '<control>'
30
+
31
+ codepoint = {
32
+ :codepoint => row[0],
33
+ :char => row[0].length == 4 && row[0] >= '0020' && row[0] <= '007E' ? row[0].to_i(16).chr : nil,
34
+ :name => row[1],
35
+ :category => row[2],
36
+ :decomp => row[5] ? row[5].split.select{|v| v =~ /^[0-9A-F]{4}$/i}.collect{|cp| cp.intern} : nil,
37
+ :decimal => row[6] ? Integer(row[6]) : (row[7] ? Integer(row[7]) : nil),
38
+ :numeric => row[8] =~ /^[0-9]+$/ ? Integer(row[8]) : nil,
39
+ :upcase => row[12] ? row[12].intern : nil,
40
+ :downcase => row[13] ? row[13].intern : nil,
18
41
  }
19
- data[:decomp] = nil if data[:decomp].size == 0
20
42
 
21
- database[data[:key]] = database[data[:name]] = data
43
+ # sorry guys, but I really don't know what to make of these
44
+ next if %w{KANNADA MALAYALAM TELUGU TAMIL ORIYA GUJARATI GURMUKHI BENGALI DEVANAGARI THAANA
45
+ SYRIAC HEBREW COPTIC HANGUL CYRILLIC ARMENIAN ARABIC SINHALA THAI CHEROKEE
46
+ OGHAM RUNIC TAGALOG HANUNOO BUHID TAGBANWA KHMER MONGOLIAN MEASURED KANGXI
47
+ LAO MYANMAR GEORGIAN ETHIOPIC TIBETAN}.include?(codepoint[:name].split[0])
48
+ next if codepoint[:name] =~ /CANADIAN SYLLABICS /
49
+ next if codepoint[:name] =~ /PHILIPPINE (SINGLE|DOUBLE) PUNCTUATION/
50
+ next if codepoint[:name] =~ /BOX DRAWINGS /
51
+ next if codepoint[:name] =~ /MUSICAL SYMBOL /
52
+ next if codepoint[:name] =~ /CJK COMPATIBILITY /
53
+
54
+ if codepoint[:char].nil?
55
+ name = codepoint[:name].dup.split.reject{|w| %w{TURNED INVERTED CLOSED REVERSED OPEN}.include?(w)}.join(' ')
56
+
57
+ codepoint[:char] ||= $2 if name =~ /LATIN CAPITAL (LIGATURE|LETTER) ([A-Z]+)/ && codepoint[:category] == 'Lu'
58
+ codepoint[:char] ||= $1 if name =~ /LATIN LETTER ([A-Z]+)/ && codepoint[:category] == 'Lu'
59
+ codepoint[:char] ||= $3.downcase if name =~ /LATIN( LETTER)? SMALL (LIGATURE|LETTER|CAPITAL|SCRIPT) ([A-Z]+)/ && codepoint[:category] == 'Ll'
60
+ codepoint[:char] ||= $1.downcase if name =~ /LATIN LETTER STRETCHED ([A-Z]+)/ && codepoint[:category] == 'Ll'
61
+ codepoint[:char] ||= ' ' if name =~ /NO-BREAK SPACE/ && %w{Cf Zs}.include?(codepoint[:category])
62
+ codepoint[:char] ||= $2.downcase if name =~ /MODIFIER LETTER SMALL( CAPITAL)? ([A-Z])$/ && codepoint[:category] == 'Lm'
63
+ codepoint[:char] ||= $2.downcase if name =~ /MODIFIER LETTER SMALL( CAPITAL)? ([A-Z]) / && codepoint[:category] == 'Lm'
64
+ end
65
+
66
+ codepoint[:char] ||= MANUAL[codepoint[:name]]
67
+
68
+ database[codepoint[:codepoint].intern] = database[codepoint[:name]] = codepoint
22
69
  }
23
70
 
24
71
  database.each_pair{|k, v|
25
- next unless k =~ /^[0-9A-Z]+$/
72
+ next unless k.is_a?(Symbol) && !v[:char]
26
73
 
27
74
  case v[:category]
28
75
  when 'Mn'
29
- v[:char] = database[v[:name].gsub(/^COMBINING /, '')][:char]
76
+ v[:char] = database[v[:name].gsub(/^COMBINING /, '')][:char] if database[v[:name].gsub(/^COMBINING /, '')]
30
77
 
31
78
  when 'Lu', 'Ll'
32
- if v[:char].nil?
33
- if v[:decomp]
34
- v[:char] = database[v[:decomp][0]][:char]
35
- else
36
- m = v[:name].match(/(CAPITAL|SMALL) LETTER [A-Z]+ (.)$/)
37
- v[:char] = database["LATIN #{m[1]} LETTER #{m[2]}"][:char] if m
38
- end
79
+ if v[:decomp]
80
+ v[:char] = v[:decomp].collect{|c| database[c]}.compact.collect{|c| c[:char]}.join('')
81
+ v[:char] = nil if v[:char] == ''
82
+ else
83
+ names = v[:name].split.reject{|w| %w{LATIN MACRON SHARP WITH STROKE HOOK TOPBAR DOT ABOVE ACUTE}.include?(w)}
84
+ names.size.downto(1){|n|
85
+ names.permutation(n).to_a.each{|name|
86
+ name = "LATIN " + name.join(' ')
87
+ v[:char] = database[name][:char] if database[name] && database[name][:char]
88
+ break if v[:char]
89
+ }
90
+ break if v[:char]
91
+ }
39
92
  end
40
93
 
41
- when 'Lo', 'So', 'Po', 'Sc', 'Mc', 'Sm', 'Ps', 'Pe', 'Pd', 'Cs'
42
- v[:char] = database[v[:name].gsub(/.* /, '')][:char] unless v[:char]
94
+ when 'Lm', 'Lo', 'So', 'Po', 'Sc', 'Mc', 'Sm', 'Ps', 'Pe', 'Pd', 'Cs'
95
+ names = v[:name].split.reject{|w| %w{PARENTHESIZED CIRCLED ELEMENT VERTICAL HORIZONTAL STROKE LEFTWARDS ARROW BAR OVER
96
+ RIGHTWARDS WITH DOT ABOVE BELOW TAG INVERTED LEFT-POINTING RIGHT-POINTING
97
+ QUADRANT UPPER LEFT LOWER RIGHT
98
+ ANGLE SUPERSCRIPT SUBSCRIPT DOWNWARDS UPWARDS}.include?(w)}
99
+ STDOUT.flush
100
+ names.size.downto(1){|n|
101
+ names.permutation(n).to_a.each{|name|
102
+ name = name.join(' ')
103
+ v[:char] = database[name][:char] if database[name] && database[name][:char]
104
+ break if v[:char]
105
+ }
106
+ break if v[:char]
107
+ }
43
108
 
44
109
  when 'Lt'
45
110
  m = v[:name].match(/(CAPITAL|SMALL) LETTER (.) WITH (CAPITAL|SMALL) LETTER (.)/)
@@ -50,7 +115,7 @@ module AsciiDammit
50
115
 
51
116
  when 'Nl', 'No', 'Nd'
52
117
  if v[:decomp]
53
- v[:char] = v[:decomp].collect{|c| database[c][:char].to_s}.join('')
118
+ v[:char] = v[:decomp].collect{|c| database[c]}.compact.collect{|c| c[:char]}.join('')
54
119
  v[:char] = nil if v[:char] == ''
55
120
  end
56
121
  if v[:char].nil?
@@ -65,28 +130,38 @@ module AsciiDammit
65
130
  when 'Zs'
66
131
  v[:char] = ' '
67
132
 
68
- when 'Sk', 'Cf', 'Lm', 'Me', 'Pc', 'Cc', 'Co', 'Zl', 'Zp'
69
- #
133
+ when 'Sk', 'Cf', 'Me', 'Pc', 'Cc', 'Co', 'Zl', 'Zp'
134
+ v[:ignore] = true
70
135
 
71
136
  else
72
137
  raise "Unhandled character category #{v[:category]}"
73
138
  end
139
+
140
+ if v[:char].nil?
141
+ v[:char] = database[v[:upcase]][:char].downcase if v[:upcase] && database[v[:upcase]][:char]
142
+ v[:char] = database[v[:downcase]][:char].upcase if v[:downcase] && database[v[:downcase]][:char]
143
+ end
144
+
145
+ #puts "#{v[:name]} (#{v[:category]})" unless v[:char] || v[:ignore]
146
+
147
+ unmatched << v unless v[:char]
74
148
  }
75
- database.dup.each_pair{|k, v|
76
- next unless k =~ /^[0-9A-Z]+$/ && v[:char]
77
- database[k.to_i(16)] = v[:char]
149
+ database.keys.each{|k|
150
+ next unless k.is_a?(Symbol)
151
+ database[eval("\"\\u{#{k}}\"")] = database[k][:char]
78
152
  }
79
- database = database.reject{|k, v| v.is_a?(Hash)}
80
- File.open(DATABASEFILE, "wb", :encoding => 'utf-8'){|f| f.write(database.to_yaml) }
153
+ database[:unmatched] = unmatched
154
+ puts "#{database[:unmatched].size} unmatched"
155
+ File.open(DATABASEFILE, 'wb'){|f| f.write(database.reject{|k, v| v.is_a?(Hash)}.to_msgpack) }
81
156
  end
82
157
  module_function :update
83
158
 
84
159
  update unless File.exists?(DATABASEFILE)
85
- DATABASE = YAML.load_file(DATABASEFILE)
160
+ DATABASE = File.open(DATABASEFILE, 'rb'){|f| MessagePack.unpack(f.read)}
86
161
  end
87
162
 
88
163
  class String
89
164
  def asciidammit
90
- self.split(//).collect{|c| AsciiDammit::DATABASE[c[0].ord].to_s}.join('')
165
+ self.split(//).collect{|c| a = AsciiDammit::DATABASE[c]; raise "Unhandled char #{c.inspect} (#{a.inspect})" unless a; a.to_s}.join('')
91
166
  end
92
167
  end