unicode 0.4.4-java

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 314559dfe96e14c8d0f640f1aa8498fb76fd1b23
4
+ data.tar.gz: 972889e38f2b9641c5d926b33ee5bd49a3b0df19
5
+ SHA512:
6
+ metadata.gz: 11aefd96309397148db1cfa440e520775bbc75696eb040375e7ce994781294768f6ed594123f2f131ca90bc215670681472d8aff05e54b8743932f45607160f0
7
+ data.tar.gz: db4c1e76b612de731e5868fd737bf53e7bf9a55ce12430e47e08f3bc94c8320363d066b1235a4178cd0406aabebe70edccd600198a4582d2c1d73a96951647ec
data/README ADDED
@@ -0,0 +1,156 @@
1
+ Unicode Library for Ruby
2
+ Version 0.4.4
3
+
4
+ Yoshida Masato
5
+
6
+
7
+ - Introduction
8
+
9
+ Unicode string manipulation library for Ruby.
10
+ This library is based on UAX #15 Unicode Normalization Forms(*1).
11
+
12
+ *1 <URL:http://www.unicode.org/unicode/reports/tr15/>
13
+
14
+
15
+ - Install
16
+
17
+ This works with ruby-1.8.7 or later; ruby-1.9.3 or later is
18
+ recommended.
19
+
20
+ Build and install in the usual way.
21
+ For example, when Ruby supports dynamic linking on your OS,
22
+
23
+ ruby extconf.rb
24
+ make
25
+ make install
26
+
27
+ To install using gem, for example:
28
+
29
+ gem build unicode.gemspec
30
+ gem install unicode
31
+
32
+
33
+ - Usage
34
+
35
+ If you do not link this module with Ruby statically,
36
+
37
+ require "unicode"
38
+
39
+ before using.
40
+
41
+
42
+ - Module Functions
43
+
44
+ All parameters of functions must be UTF-8 strings.
45
+
46
+ Unicode::strcmp(str1, str2)
47
+ Unicode::strcmp_compat(str1, str2)
48
+ Compare Unicode strings with a normalization.
49
+ strcmp uses the Normalization Form D, strcmp_compat uses
50
+ Normalization Form KD.
51
+
52
+ Unicode::decompose(str)
53
+ Unicode::decompose_compat(str)
54
+ Decompose Unicode string. Then the trailing characters
55
+ are sorted in canonical order.
56
+ decompose uses the canonical decomposition,
57
+ decompose_compat uses the compatibility decomposition.
58
+ The decomposition is based on the character decomposition
59
+ mapping in UnicodeData.txt and the Hangul decomposition
60
+ algorithm.
61
+
62
+ Unicode::decompose_safe(str)
63
+ Decompose Unicode string with a non-standard mapping.
64
+ It does not decompose the characters in
65
+ CompositionExclusions.txt.
66
+
67
+ Unicode::compose(str)
68
+ Compose Unicode string. Before composing, the trailing
69
+ characters are sorted in canonical order.
70
+ The parameter must be decomposed.
71
+ The composition is based on the reverse of the
72
+ character decomposition mapping in UnicodeData.txt,
73
+ CompositionExclusions.txt and the Hangul composition
74
+ algorithm.
75
+
76
+ Unicode::normalize_D(str) (Unicode::nfd(str))
77
+ Unicode::normalize_KD(str) (Unicode::nfkd(str))
78
+ Normalize Unicode string in form D or form KD.
79
+ These are aliases of decompose/decompose_compat.
80
+
81
+ Unicode::normalize_D_safe(str) (Unicode::nfd_safe(str))
82
+ This is an alias of decompose_safe.
83
+
84
+ Unicode::normalize_C(str) (Unicode::nfc(str))
85
+ Unicode::normalize_KC(str) (Unicode::nfkc(str))
86
+ Normalize Unicode string in form C or form KC.
87
+ normalize_C = decompose + compose
88
+ normalize_KC = decompose_compat + compose
89
+
90
+ Unicode::normalize_C_safe(str) (Unicode::nfc_safe(str))
91
+ Normalize Unicode string with decompose_safe.
92
+ normalize_C_safe = decompose_safe + compose
93
+
94
+ Unicode::upcase(str)
95
+ Unicode::downcase(str)
96
+ Unicode::capitalize(str)
97
+ Case conversion functions.
98
+ The mappings that are used by these functions are not normative
99
+ in UnicodeData.txt.
100
+
101
+ Unicode::categories(str)
102
+ Unicode::abbr_categories(str)
103
+ Get an array of general category names of the string.
104
+ abbr_categories returns abbreviated names.
105
+ These can be called with a block.
106
+
107
+ Unicode.categories(str) do |category| p category end
108
+
109
+ Unicode::text_elements(str)
110
+ Get an array of text elements.
111
+ A text element is a unit that is displayed as a single character.
112
+ These can be called with a block.
113
+
114
+ Unicode::width(str[, cjk])
115
+ Estimate the display width on the fixed pitch text terminal.
116
+ It is based on Markus Kuhn's mk_wcwidth.
117
+ If the optional argument 'cjk' is true, East Asian
118
+ Ambiguous characters are treated as wide characters.
119
+
120
+ Unicode.width("\u03b1") #=> 1
121
+ Unicode.width("\u03b1", true) #=> 2
122
+
123
+
124
+ - Bugs
125
+
126
+ UAX #15 suggests that the look up for Normalization Form C
127
+ should not be implemented with a hash of string for better
128
+ performance.
129
+
130
+
131
+ - Copying
132
+
133
+ This extension module is copyrighted free software by
134
+ Yoshida Masato.
135
+
136
+ You can redistribute it and/or modify it under the same
137
+ terms as Ruby.
138
+
139
+
140
+ - Author
141
+
142
+ Yoshida Masato <yoshidam@yoshidam.net>
143
+
144
+
145
+ - History
146
+
147
+ Feb 7, 2013 version 0.4.4 update unidata.map for Unicode 6.2
148
+ Aug 8, 2012 version 0.4.3 add categories, text_elements and width
149
+ Feb 29, 2012 version 0.4.2 add decompose_safe
150
+ Feb 3, 2012 version 0.4.1 update unidata.map for Unicode 6.1
151
+ Oct 14, 2010 version 0.4.0 fix the composition algorithm, and support Unicode 6.0
152
+ Feb 26, 2010 version 0.3.0 fix a capitalize bug and support SpecialCasing
153
+ Dec 29, 2009 version 0.2.0 update for Ruby 1.9.1 and Unicode 5.2
154
+ Sep 10, 2005 version 0.1.2 update unidata.map for Unicode 4.1.0
155
+ Aug 26, 2004 version 0.1.1 update unidata.map for Unicode 4.0.1
156
+ Nov 23, 1999 version 0.1
data/Rakefile ADDED
@@ -0,0 +1,117 @@
1
+ require "rake/clean"
2
+ require "rake/extensiontask"
3
+ require "rubygems/package_task"
4
+
5
+ CLEAN << "pkg" << "tmp" << "lib/unicode"
6
+
7
+ UPSTREAM_URL = 'http://www.yoshidam.net/unicode-%s.tar.gz'
8
+
9
+ gem_spec = eval(File.read(File.expand_path("../unicode.gemspec", __FILE__)))
10
+
11
+ gem_task = Gem::PackageTask.new(gem_spec) {|pkg|}
12
+
13
+ Rake::ExtensionTask.new('unicode_native', gem_spec) do |ext|
14
+ ext.cross_compile = true
15
+ ext.cross_platform = ['x86-mingw32', 'x86-mswin32-60']
16
+ ext.ext_dir = "ext/unicode"
17
+ ext.lib_dir = "lib/unicode"
18
+ end
19
+
20
+ namespace :gem do
21
+
22
+ desc 'Build all gem files'
23
+ task :all => %w[clean gem gem:java gem:windows]
24
+
25
+ java_gem_spec = gem_spec.dup
26
+ java_gem_spec.platform = 'java'
27
+ java_gem_spec.extensions.clear
28
+ java_gem_spec.files.delete_if { |f| f.start_with?('ext/') }
29
+
30
+ directory java_gem_dir = gem_task.package_dir
31
+
32
+ java_gem_file = File.basename(java_gem_spec.cache_file)
33
+ java_gem_path = File.join(java_gem_dir, java_gem_file)
34
+
35
+ desc "Build the gem file #{java_gem_file}"
36
+ task :java => java_gem_path
37
+
38
+ file java_gem_path => [java_gem_dir] + java_gem_spec.files do
39
+ lib_file = 'lib/unicode.rb'
40
+ tmp_file = "#{lib_file}.tmp-#{$$}"
41
+
42
+ begin
43
+ mv lib_file, tmp_file
44
+
45
+ File.write(lib_file, <<-EOT)
46
+ module Unicode
47
+
48
+ extend self
49
+
50
+ def upcase(str)
51
+ str.to_java.to_upper_case
52
+ end
53
+
54
+ def downcase(str)
55
+ str.to_java.to_lower_case
56
+ end
57
+
58
+ end
59
+ EOT
60
+
61
+ Gem::Package.build(java_gem_spec)
62
+
63
+ mv java_gem_file, java_gem_dir
64
+ ensure
65
+ mv tmp_file, lib_file if File.exist?(tmp_file)
66
+ end
67
+ end
68
+
69
+ desc "Build native gems for Windows"
70
+ task :windows do
71
+ ENV["RUBY_CC_VERSION"] = "1.8.7:1.9.3"
72
+ sh "rake cross compile"
73
+ sh "rake cross native gem"
74
+ end
75
+
76
+ end
77
+
78
+ desc "Update from upstream"
79
+ task :update, [:version] do |t, args|
80
+ require 'zlib'
81
+ require 'open-uri'
82
+ require 'archive/tar/minitar'
83
+
84
+ unless version = args.version || ENV['UPSTREAM_VERSION']
85
+ abort "Please specify UPSTREAM_VERSION. See #{gem_spec.homepage}."
86
+ end
87
+
88
+ io = begin
89
+ open(url = UPSTREAM_URL % version)
90
+ rescue OpenURI::HTTPError
91
+ abort "Upstream version not found: #{url}. See #{gem_spec.homepage}."
92
+ end
93
+
94
+ Archive::Tar::Minitar.open(Zlib::GzipReader.new(io)) { |tar|
95
+ basedir = File.expand_path('..', __FILE__)
96
+
97
+ extract = lambda { |entry, name, dir|
98
+ puts "Extracting `#{name}' to `#{dir || '.'}'..."
99
+ tar.extract_entry(dir ? File.join(basedir, dir) : basedir, entry)
100
+ }
101
+
102
+ tar.each { |entry|
103
+ entry.name.sub!(/\Aunicode\//, '')
104
+
105
+ case name = entry.full_name
106
+ when /\Atools\/|\.gemspec\z/, 'README'
107
+ extract[entry, name, nil]
108
+ when /\.(?:[ch]|map)\z/, 'extconf.rb'
109
+ extract[entry, name, 'ext/unicode']
110
+ when /\Atest/
111
+ extract[entry, name, 'test']
112
+ else
113
+ puts "Skipping `#{name}'..."
114
+ end
115
+ }
116
+ }
117
+ end
data/lib/unicode.rb ADDED
@@ -0,0 +1,13 @@
# Reduced implementation of the Unicode module for the Java-platform gem.
# Only case conversion is provided, and it is delegated to java.lang.String
# (the receiver is converted with #to_java, so a JRuby runtime is required).
module Unicode

  extend self

  # Returns +str+ converted to upper case via java.lang.String#toUpperCase.
  #
  # Locale::ROOT is passed explicitly: the no-argument Java method uses the
  # JVM's default locale, which makes the result environment-dependent
  # (e.g. "i" upcases to U+0130 under a Turkish default locale).
  def upcase(str)
    str.to_java.to_upper_case(Java::JavaUtil::Locale::ROOT)
  end

  # Returns +str+ converted to lower case via java.lang.String#toLowerCase,
  # using Locale::ROOT for the same locale-independence reason as #upcase.
  def downcase(str)
    str.to_java.to_lower_case(Java::JavaUtil::Locale::ROOT)
  end

end
data/test/test.rb ADDED
@@ -0,0 +1,69 @@
1
+ #! /usr/local/bin/ruby -KU
2
+ # -*- coding: utf-8 -*-
3
+
4
+ require 'unicode'
5
+
## dump Unicode string
class String
  # Returns an array with one "U+XXXX" label per code point of the receiver
  # (which must be valid UTF-8), e.g. "a".udump #=> ["U+0061"].
  #
  # unpack("U*") only ever yields Integers, so every element is formatted;
  # no per-element type check is needed.
  def udump
    unpack("U*").map { |codepoint| "U+%04X" % codepoint }
  end
end
21
+
22
+
23
+ print "Canonical decomposition vs compatibility decomposition\n"
24
+ p Unicode::decompose("⑽ o\xef\xac\x83ce").udump
25
+ p Unicode::decompose_compat("⑽ o\xef\xac\x83ce")
26
+
27
+ print "Canonical equivalent vs Compatibility equivalent\n"
28
+ p Unicode::strcmp("ガ", "ガ")
29
+ p Unicode::strcmp("ガ", "ガ")
30
+ p Unicode::strcmp_compat("ガ", "ガ")
31
+
32
+ print "Decomposition/composition\n"
33
+ p Unicode::normalize_D([0x63, 0x301, 0x327].pack("U*")).udump
34
+ p Unicode::normalize_D([0x63, 0x327, 0x301].pack("U*")).udump
35
+ p Unicode::normalize_D([0x107, 0x327].pack("U*")).udump
36
+ p Unicode::normalize_D([0xe7, 0x301].pack("U*")).udump
37
+ p Unicode::normalize_C([0x63, 0x301, 0x327].pack("U*")).udump
38
+ p Unicode::normalize_C([0x63, 0x327, 0x301].pack("U*")).udump
39
+ p Unicode::normalize_C([0x107, 0x327].pack("U*")).udump
40
+ p Unicode::normalize_C([0xe7, 0x301].pack("U*")).udump
41
+
42
+ print "Kana Normalization\n"
43
+ p Unicode::normalize_D("ガガ").udump
44
+ p Unicode::normalize_C("ガガ").udump
45
+ p Unicode::normalize_KD("ガガ").udump
46
+ p Unicode::normalize_KC("ガガ").udump
47
+
48
+ print "Hangul\n"
49
+ p "요시담".udump
50
+ p Unicode::normalize_D("요시담").udump
51
+ p Unicode::normalize_C("요시담").udump
52
+
53
+ print "Composition Exclusion\n"
54
+ print " ANGSTROM SIGN [U+212B]\n"
55
+ p Unicode::normalize_D([0x212b].pack("U")).udump
56
+ p Unicode::normalize_C([0x212b].pack("U")).udump
57
+ print " LATIN CAPITAL LETTER A WITH RING ABOVE [U+00C5]\n"
58
+ p Unicode::normalize_D([0x00c5].pack("U")).udump
59
+ p Unicode::normalize_C([0x00c5].pack("U")).udump
60
+
61
+ print "Case conversion\n"
62
+ p Unicode::normalize_C(Unicode::upcase([0x63, 0x301, 0x327, 0xff41].pack("U*"))).udump
63
+ p Unicode::normalize_C(Unicode::downcase([0x43, 0x301, 0x327, 0xff21].pack("U*"))).udump
64
+ p Unicode::capitalize([0x1f1, 0x41, 0x61, 0xff21].pack("U*")).udump
65
+
66
+
67
+ ## Local variables:
68
+ ## coding: utf-8
69
+ ## End:
data/tools/README ADDED
@@ -0,0 +1,7 @@
1
+ The bundled unidata.map is created from UnicodeData.txt,
2
+ DerivedNormalizationProps.txt, SpecialCasing.txt and EastAsianWidth.txt
3
+ of Unicode 6.2.
4
+
5
+ To update unidata.map,
6
+
7
+ ruby mkunidata.rb UnicodeData.txt DerivedNormalizationProps.txt SpecialCasing.txt EastAsianWidth.txt > unidata.map
@@ -0,0 +1,293 @@
1
+ #! /usr/local/bin/ruby -KU
2
+
3
+ #if $KCODE != 'UTF8'
4
+ # raise "$KCODE must be UTF8"
5
+ #end
6
+
7
+ HEAD=<<EOS
8
+ /*
9
+ * UnicodeData
10
+ * Copyright 1999, 2004, 2010, 2012 by yoshidam
11
+ *
12
+ */
13
+
14
+ #ifndef _UNIDATA_MAP
15
+ #define _UNIDATA_MAP
16
+
17
+ EOS
18
+
19
+ HEAD1=<<EOS
20
+
21
+ enum GeneralCategory {
22
+ /* Letter */
23
+ c_Lu = 1, c_Ll, c_Lt, c_LC, c_Lm, c_Lo,
24
+ /* Mark */
25
+ c_Mn, c_Mc, c_Me,
26
+ /* Number */
27
+ c_Nd, c_Nl, c_No,
28
+ /* Punctuation */
29
+ c_Pc, c_Pd, c_Ps, c_Pe, c_Pi, c_Pf, c_Po,
30
+ /* Symbol */
31
+ c_Sm, c_Sc, c_Sk, c_So,
32
+ /* Separator */
33
+ c_Zs, c_Zl, c_Zp,
34
+ /* Other */
35
+ c_Cc, c_Cf, c_Cs, c_Co, c_Cn
36
+ };
37
+
38
+ const char* const gencat_abbr[] = {
39
+ "", /* 0 */
40
+ /* Letter */
41
+ "Lu", "Ll", "Lt", "LC", "Lm", "Lo",
42
+ /* Mark */
43
+ "Mn", "Mc", "Me",
44
+ /* Number */
45
+ "Nd", "Nl", "No",
46
+ /* Punctuation */
47
+ "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po",
48
+ /* Symbol */
49
+ "Sm", "Sc", "Sk", "So",
50
+ /* Separator */
51
+ "Zs", "Zl", "Zp",
52
+ /* Other */
53
+ "Cc", "Cf", "Cs", "Co", "Cn"
54
+ };
55
+
56
+ const char* const gencat_long[] = {
57
+ "",
58
+ "Uppercase_Letter",
59
+ "Lowercase_Letter",
60
+ "Titlecase_Letter",
61
+ "Cased_Letter",
62
+ "Modifier_Letter",
63
+ "Other_Letter",
64
+ "Nonspacing_Mark",
65
+ "Spacing_Mark",
66
+ "Enclosing_Mark",
67
+ "Decimal_Number",
68
+ "Letter_Number",
69
+ "Other_Number",
70
+ "Connector_Punctuation",
71
+ "Dash_Punctuation",
72
+ "Open_Punctuation",
73
+ "Close_Punctuation",
74
+ "Initial_Punctuation",
75
+ "Final_Punctuation",
76
+ "Other_Punctuation",
77
+ "Math_Symbol",
78
+ "Currency_Symbol",
79
+ "Modifier_Symbol",
80
+ "Other_Symbol",
81
+ "Space_Separator",
82
+ "Line_Separator",
83
+ "Paragraph_Separator",
84
+ "Control",
85
+ "Format",
86
+ "Surrogate",
87
+ "Private_Use",
88
+ "Unassigned"
89
+ };
90
+
91
+ enum EastAsianWidth {
92
+ w_N = 1, w_A, w_H, w_W, w_F, w_Na
93
+ };
94
+
95
+ struct unicode_data {
96
+ const int code;
97
+ const char* const canon;
98
+ const char* const compat;
99
+ const char* const uppercase;
100
+ const char* const lowercase;
101
+ const char* const titlecase;
102
+ const unsigned char combining_class;
103
+ const unsigned char exclusion;
104
+ const unsigned char general_category;
105
+ const unsigned char east_asian_width;
106
+ };
107
+
108
+ static const struct unicode_data unidata[] = {
109
+ EOS
110
+
111
+ TAIL=<<EOS
112
+ };
113
+
114
+ #endif
115
+ EOS
116
+
# Convert a decomposition-mapping field (as passed in from a UnicodeData.txt
# record) into UTF-8 strings.
#
# hex - the raw field: space-separated hexadecimal code points, optionally
#       preceded by a formatting tag in angle brackets (e.g. "<compat>").
#
# Returns a two-element array [canon, compat]:
#   [nil, nil]  when the field is nil or empty;
#   [str, str]  for an untagged mapping (the same string in both slots);
#   [nil, str]  for a tagged mapping.
# Raises RuntimeError ("unknown value: ...") when the first token is neither
# a 4-6 digit upper-case hex code point nor a "<tag>".
def hex2str(hex)
  return [nil, nil] if hex.nil? || hex == ''

  tokens = hex.split(" ")
  case tokens.first
  when /\A[0-9A-F]{4,6}\z/
    canon = tokens.map(&:hex).pack("U*")
    [canon, canon]
  when /\A<.+>\z/
    # Skip the leading "<tag>"; the remaining tokens are the code points.
    [nil, tokens.drop(1).map(&:hex).pack("U*")]
  else
    raise "unknown value: " + hex
  end
end
140
+
# Convert a field of space-separated hexadecimal code points into a UTF-8
# string.
#
# str - the raw field, or nil.
#
# Returns nil when the field is nil or empty, otherwise the string made of
# the listed code points in order.
def hex_or_nil(str)
  return nil if str.nil? || str == ''

  str.split(" ").map(&:hex).pack("U*")
end
150
+
# Render a string as a C string literal for the generated table.
#
# str - the string to emit, or nil/false.
#
# Returns the text "NULL" when str is nil/false. Otherwise returns the bytes
# of str wrapped in double quotes, where printable ASCII (32..126) is kept
# verbatim except for '"' (34) and '\' (92), and every other byte is written
# as a three-digit octal escape.
def printstr(str)
  return "NULL" unless str

  pieces = str.each_byte.map do |byte|
    literal = byte >= 32 && byte < 127 && byte != 34 && byte != 92
    literal ? byte.chr : format("\\%03o", byte)
  end
  '"' + pieces.join + '"'
end
163
+
164
+ if ARGV.length != 4
165
+ puts "Usage: #{$0} <UnicodeData.txt> <DerivedNormalizationProps.txt> <SpecialCasing.txt> <EastAsianWidth.txt>"
166
+ exit 0
167
+ end
168
+
169
+ ## scan Composition Exclusions
170
+ exclusion = {}
171
+ open(ARGV[1]) do |f|
172
+ while l = f.gets
173
+ next if l =~ /^\#/ || l =~ /^$/
174
+ next if l !~ /Full_Composition_Exclusion/
175
+ code, = l.split(/\s/)
176
+ if code =~ /^[0-9A-F]+$/
177
+ code = code.hex
178
+ exclusion[code] = true
179
+ elsif code =~ /^([0-9A-F]+)\.\.([0-9A-F]+)$/
180
+ # p [$1, $2]
181
+ scode = $1.hex
182
+ ecode = $2.hex
183
+ for code in scode..ecode
184
+ exclusion[code] = true
185
+ end
186
+ end
187
+ end
188
+ end
189
+
190
+ ## scan Special Casing
191
+ casing = {}
192
+ open(ARGV[2]) do |f|
193
+ while l = f.gets
194
+ l.chomp!
195
+ next if l =~ /^\#/ || l =~ /^$/
196
+ l =~ /^(.*)#\s*(.*)$/
197
+ l = $1
198
+ comment = $2
199
+ code,lower,title,upper,cond = l.split(/;\s/)
200
+ next if cond
201
+ lower = nil if code == lower
202
+ title = nil if code == title
203
+ upper = nil if code == upper
204
+ code = code.hex
205
+ casing[code] = [hex_or_nil(lower), hex_or_nil(title), hex_or_nil(upper)]
206
+ end
207
+ end
208
+
209
+ ## scan UnicodeData
210
+ udata = {}
211
+ range_data = []
212
+ open(ARGV[0]) do |f|
213
+ while l = f.gets
214
+ l.chomp!
215
+ code, charname, gencat, ccclass, bidicat,decomp,
216
+ dec, digit, num, mirror, uni1_0, comment, upcase,
217
+ lowcase, titlecase = l.split(";", 15);
218
+ code = code.hex
219
+ ccclass = ccclass.to_i
220
+ canon, compat = hex2str(decomp)
221
+ upcase = hex_or_nil(upcase)
222
+ lowcase = hex_or_nil(lowcase)
223
+ titlecase = hex_or_nil(titlecase)
224
+ udata[code] = [ccclass, canon, compat, upcase, lowcase, titlecase, gencat]
225
+ if charname =~ /^<(.*, (First|Last))>$/
226
+ charname = $1.upcase.gsub(/,? /, '_')
227
+ range_data << [charname, code]
228
+ end
229
+ end
230
+ end
231
+
232
+ ## scan EastAsianWidth
233
+ ea_width = {}
234
+ open(ARGV[3]) do |f|
235
+ while l = f.gets
236
+ l.chomp!
237
+ next if l =~ /^\#/ || l =~ /^$/
238
+ l =~ /^(.*)\s+#\s*(.*)$/
239
+ l = $1
240
+ comment = $2
241
+ code,width = l.split(/;/)
242
+ if code =~ /\.\./
243
+ start_code, end_code = code.split('..')
244
+ start_code = start_code.hex
245
+ end_code = end_code.hex
246
+ (start_code..end_code).each do |code|
247
+ ea_width[code] = width
248
+ end
249
+ next
250
+ end
251
+ code = code.hex
252
+ ea_width[code] = width
253
+ end
254
+ end
255
+
256
+ print HEAD
257
+ range_data.each do |charname, code|
258
+ printf("#define %s\t(0x%04x)\n", charname, code)
259
+ end
260
+
261
+ print HEAD1
262
+ udata.sort.each do |code, data|
263
+ ccclass, canon, compat, upcase, lowcase, titlecase, gencat = data
264
+ ## Exclusions
265
+ ex = 0
266
+ if exclusion[code] ## Script-specifics or Post Composition Version
267
+ ex = 1
268
+ elsif canon =~ /^.$/ ## Singletons
269
+ ex = 2
270
+ elsif !canon.nil?
271
+ starter = canon.unpack("U*")[0]
272
+ if udata[starter][0] != 0 ## Non-starter decompositions
273
+ ex = 3
274
+ end
275
+ end
276
+ ## Special Casing
277
+ if casing[code]
278
+ lowcase = casing[code][0] if casing[code][0]
279
+ titlecase = casing[code][1] if casing[code][1]
280
+ upcase = casing[code][2] if casing[code][2]
281
+ end
282
+ width = 'N'
283
+ if ea_width[code]
284
+ width = ea_width[code]
285
+ end
286
+
287
+ printf(" { 0x%04x, %s, %s, %s, %s, %s, %d, %d, c_%s, w_%s }, \n",
288
+ code, printstr(canon),
289
+ printstr(compat), printstr(upcase), printstr(lowcase),
290
+ printstr(titlecase), ccclass, ex, gencat, width)
291
+ end
292
+ printf(" { -1, NULL, NULL, NULL, NULL, NULL, 0, 0, 0, 0 }\n")
293
+ print TAIL
data/tools/normtest.rb ADDED
@@ -0,0 +1,111 @@
1
+ #! /usr/local/bin/ruby -KU
2
+
3
+ ## Conformance test with NormalizationTest.txt
4
+ ## Copyright 2010 yoshidam
5
+
6
+ require 'unicode'
7
+
8
+ TESTFILE = "NormalizationTest.txt"
9
+
# Build a UTF-8 string from a field of space-separated hexadecimal code
# points (e.g. "0041 0300"). An empty field yields an empty string.
def from_hex(str)
  str.split(" ").map(&:hex).pack("U*")
end
18
+
# Format a UTF-8 string as its code points in upper-case hexadecimal (at
# least four digits each); every code point is followed by a single space,
# so "A" becomes "0041 ". An empty string yields an empty string.
def to_hex(str)
  str.unpack('U*').map { |codepoint| sprintf("%04X ", codepoint) }.join
end
27
+
# Read TESTFILE line by line and check each record against the four
# normalization functions. Every data line holds five ';'-separated columns
# of hex code points (c1..c5). "@" lines are echoed, "#" lines and blank
# lines are skipped, and each failing invariant is reported with "NG".
File.open(TESTFILE) do |f|
  while l = f.gets
    next if l =~ /^#/
    l.chomp!                  # strip the newline in place (plain chomp discards its result)
    next if l.empty?          # a blank line has no columns to convert
    if l =~ /^@/
      puts l
      next
    end
    c1, c2, c3, c4, c5 = l.split(';')
    code = c1                 # raw hex of the source column, for the debug lines below
    c1 = from_hex(c1)
    c2 = from_hex(c2)
    c3 = from_hex(c3)
    c4 = from_hex(c4)
    c5 = from_hex(c5)

    ## NFC TEST
    ## c2 == NFC(c1) == NFC(c2) == NFC(c3);  c4 == NFC(c4) == NFC(c5)
    if c2 == Unicode.nfc(c1) && c2 == Unicode.nfc(c2) &&
        c2 == Unicode.nfc(c3) &&
        c4 == Unicode.nfc(c4) && c4 == Unicode.nfc(c5)
      ##puts "NFC OK: " + code
    else
      puts "NFC NG: " + to_hex(c1)
      printf(" c2=%s NFC(c1)=%s NFC(c2)=%s NFC(c3)=%s\n",
             to_hex(c2),
             to_hex(Unicode.nfc(c1)),
             to_hex(Unicode.nfc(c2)),
             to_hex(Unicode.nfc(c3)))
      printf(" c4=%s NFC(c4)=%s NFC(c5)=%s\n",
             to_hex(c4),
             to_hex(Unicode.nfc(c4)),
             to_hex(Unicode.nfc(c5)))
    end

    ## NFD TEST
    ## c3 == NFD(c1) == NFD(c2) == NFD(c3);  c5 == NFD(c4) == NFD(c5)
    if c3 == Unicode.nfd(c1) && c3 == Unicode.nfd(c2) &&
        c3 == Unicode.nfd(c3) &&
        c5 == Unicode.nfd(c4) && c5 == Unicode.nfd(c5)
      ##puts "NFD OK: " + code
    else
      puts "NFD NG: " + to_hex(c1)
      printf(" c3=%s NFD(c1)=%s NFD(c2)=%s NFD(c3)=%s\n",
             to_hex(c3),
             to_hex(Unicode.nfd(c1)),
             to_hex(Unicode.nfd(c2)),
             to_hex(Unicode.nfd(c3)))
      printf(" c5=%s NFD(c4)=%s NFD(c5)=%s\n",
             to_hex(c5),
             to_hex(Unicode.nfd(c4)),
             to_hex(Unicode.nfd(c5)))
    end

    ## NFKC TEST
    ## c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
    if c4 == Unicode.nfkc(c1) && c4 == Unicode.nfkc(c2) &&
        c4 == Unicode.nfkc(c3) &&
        c4 == Unicode.nfkc(c4) && c4 == Unicode.nfkc(c5)
      ##puts "NFKC OK: " + code
    else
      puts "NFKC NG: " + to_hex(c1)
      printf(" c4=%s NFKC(c1)=%s NFKC(c2)=%s NFKC(c3)=%s NFKC(c4)=%s NFKC(c5)=%s\n",
             to_hex(c4),
             to_hex(Unicode.nfkc(c1)),
             to_hex(Unicode.nfkc(c2)),
             to_hex(Unicode.nfkc(c3)),
             to_hex(Unicode.nfkc(c4)),
             to_hex(Unicode.nfkc(c5)))
    end

    ## NFKD TEST
    ## c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
    if c5 == Unicode.nfkd(c1) && c5 == Unicode.nfkd(c2) &&
        c5 == Unicode.nfkd(c3) &&
        c5 == Unicode.nfkd(c4) && c5 == Unicode.nfkd(c5)
      ##puts "NFKD OK: " + code
    else
      puts "NFKD NG: " + to_hex(c1)
      printf(" c5=%s NFKD(c1)=%s NFKD(c2)=%s NFKD(c3)=%s NFKD(c4)=%s NFKD(c5)=%s\n",
             to_hex(c5),
             to_hex(Unicode.nfkd(c1)),
             to_hex(Unicode.nfkd(c2)),
             to_hex(Unicode.nfkd(c3)),
             to_hex(Unicode.nfkd(c4)),
             to_hex(Unicode.nfkd(c5)))
    end
  end
end
data/unicode.gemspec ADDED
@@ -0,0 +1,30 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ Gem::Specification.new do |s|
4
+ s.name = %q{unicode}
5
+ s.version = "0.4.4"
6
+
7
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
8
+ s.authors = [%q{Yoshida Masato}]
9
+ s.date = %q{2013-02-07}
10
+ s.email = %q{yoshidam@yoshidam.net}
11
+ s.licenses = %w[Ruby]
12
+ s.extensions = %w[ext/unicode/extconf.rb]
13
+ s.extra_rdoc_files = [%q{README}]
14
+ s.files = %w[
15
+ README Rakefile unicode.gemspec lib/unicode.rb
16
+ test/test.rb tools/README tools/mkunidata.rb tools/normtest.rb
17
+ ext/unicode/extconf.rb ext/unicode/unicode.c ext/unicode/unidata.map
18
+ ext/unicode/ustring.c ext/unicode/ustring.h ext/unicode/wstring.c ext/unicode/wstring.h
19
+ ]
20
+ s.homepage = %q{http://www.yoshidam.net/Ruby.html#unicode}
21
+ s.require_paths = [%q{lib}]
22
+ s.rubygems_version = %q{1.8.6}
23
+ s.summary = %q{Unicode normalization library.}
24
+ s.description = %q{Unicode normalization library.}
25
+
26
+ if s.respond_to? :specification_version then
27
+ s.specification_version = 3
28
+ end
29
+ end
30
+
metadata ADDED
@@ -0,0 +1,52 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: unicode
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.4.4
5
+ platform: java
6
+ authors:
7
+ - Yoshida Masato
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-02-07 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Unicode normalization library.
14
+ email: yoshidam@yoshidam.net
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files:
18
+ - README
19
+ files:
20
+ - README
21
+ - Rakefile
22
+ - lib/unicode.rb
23
+ - test/test.rb
24
+ - tools/README
25
+ - tools/mkunidata.rb
26
+ - tools/normtest.rb
27
+ - unicode.gemspec
28
+ homepage: http://www.yoshidam.net/Ruby.html#unicode
29
+ licenses:
30
+ - Ruby
31
+ metadata: {}
32
+ post_install_message:
33
+ rdoc_options: []
34
+ require_paths:
35
+ - lib
36
+ required_ruby_version: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ required_rubygems_version: !ruby/object:Gem::Requirement
42
+ requirements:
43
+ - - ">="
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ requirements: []
47
+ rubyforge_project:
48
+ rubygems_version: 2.4.4
49
+ signing_key:
50
+ specification_version: 3
51
+ summary: Unicode normalization library.
52
+ test_files: []