unicode 0.3.1-x86-mingw32

Sign up to get free protection for your applications and to get access to all the features.
data/lib/unicode.rb ADDED
@@ -0,0 +1,6 @@
1
# Load the native extension. A build stored under a directory named after
# the running interpreter's "major.minor" version is tried first; when that
# file cannot be loaded, the unversioned unicode/unicode_native is required
# instead.
begin
  # \A and the escaped dot restrict the match to a literal leading "X.Y";
  # an unescaped dot would accept any character between the two numbers.
  require "unicode/#{RUBY_VERSION[/\A\d+\.\d+/]}/unicode_native"
rescue LoadError
  require 'unicode/unicode_native'
end
Binary file
data/test/test.rb ADDED
@@ -0,0 +1,69 @@
1
+ #! /usr/local/bin/ruby -KU
2
+ # -*- coding: utf-8 -*-
3
+
4
+ require 'unicode'
5
+
6
+ ## dump Unicode string
7
class String
  # Dump the string as a list of code point labels.
  #
  # The receiver is unpacked with the "U*" directive and every resulting
  # code point is formatted as "U+" followed by at least four upper-case
  # hexadecimal digits.
  #
  # @return [Array<String>] one "U+XXXX" label per code point, in order
  def udump
    # unpack("U*") only ever yields Integers, so no non-Integer branch is needed.
    unpack("U*").map { |codepoint| format("U+%04X", codepoint) }
  end
end
21
+
22
+
23
# Demonstration script: each section prints a heading and then the result of
# calling functions of the Unicode module, mostly rendered through
# String#udump so the individual code points are visible.

puts "Canonical decomposition vs compatibility decomposition"
p Unicode::decompose("⑽ o\xef\xac\x83ce").udump
p Unicode::decompose_compat("⑽ o\xef\xac\x83ce")

puts "Canonical equivalent vs Compatibility equivalent"
# NOTE(review): the first two calls are textually identical here; presumably
# the original arguments differed in normalization form - confirm against the
# upstream test file.
p Unicode::strcmp("ガ", "ガ")
p Unicode::strcmp("ガ", "ガ")
p Unicode::strcmp_compat("ガ", "ガ")

puts "Decomposition/composition"
p Unicode::normalize_D([0x63, 0x301, 0x327].pack("U*")).udump
p Unicode::normalize_D([0x63, 0x327, 0x301].pack("U*")).udump
p Unicode::normalize_D([0x107, 0x327].pack("U*")).udump
p Unicode::normalize_D([0xe7, 0x301].pack("U*")).udump
p Unicode::normalize_C([0x63, 0x301, 0x327].pack("U*")).udump
p Unicode::normalize_C([0x63, 0x327, 0x301].pack("U*")).udump
p Unicode::normalize_C([0x107, 0x327].pack("U*")).udump
p Unicode::normalize_C([0xe7, 0x301].pack("U*")).udump

puts "Kana Normalization"
p Unicode::normalize_D("ガガ").udump
p Unicode::normalize_C("ガガ").udump
p Unicode::normalize_KD("ガガ").udump
p Unicode::normalize_KC("ガガ").udump

puts "Hangul"
p "요시담".udump
p Unicode::normalize_D("요시담").udump
p Unicode::normalize_C("요시담").udump

puts "Composition Exclusion"
puts " ANGSTROM SIGN [U+212B]"
p Unicode::normalize_D([0x212b].pack("U")).udump
p Unicode::normalize_C([0x212b].pack("U")).udump
puts " LATIN CAPITAL LETTER A WITH RING ABOVE [U+00C5]"
p Unicode::normalize_D([0x00c5].pack("U")).udump
p Unicode::normalize_C([0x00c5].pack("U")).udump

puts "Case conversion"
p Unicode::normalize_C(Unicode::upcase([0x63, 0x301, 0x327, 0xff41].pack("U*"))).udump
p Unicode::normalize_C(Unicode::downcase([0x43, 0x301, 0x327, 0xff21].pack("U*"))).udump
p Unicode::capitalize([0x1f1, 0x41, 0x61, 0xff21].pack("U*")).udump
65
+
66
+
67
+ ## Local variables:
68
+ ## coding: utf-8
69
+ ## End:
data/tools/README ADDED
@@ -0,0 +1,6 @@
1
+ The unidata.map is created from UnicodeData.txt,
2
+ DerivedNormalizationProps.txt and SpecialCasing.txt of Unicode 5.2
3
+
4
+ To update unidata.map,
5
+
6
+ ruby mkunidata.rb UnicodeData.txt DerivedNormalizationProps.txt SpecialCasing.txt > unidata.map
@@ -0,0 +1,169 @@
1
+ #! /usr/local/bin/ruby -KU
2
+
3
+ #if $KCODE != 'UTF8'
4
+ # raise "$KCODE must be UTF8"
5
+ #end
6
+
7
# Text printed before the generated table rows: a C comment banner, the
# include guard, the unicode_data struct declaration and the opening of the
# unidata[] array initializer.
HEAD = <<EOS
/*
* UnicodeData
* Copyright 1999, 2004, 2010 by yoshidam
*
*/

#ifndef _UNIDATA_MAP
#define _UNIDATA_MAP

struct unicode_data {
const int code;
const int combining_class;
const int exclusion;
const char* const canon;
const char* const compat;
const char* uppercase;
const char* lowercase;
const char* titlecase;
};

static const struct unicode_data unidata[] = {
EOS

# Text printed after the table rows: closes the array initializer and the
# include guard opened in HEAD.
TAIL = <<EOS
};

#endif
EOS
36
+
37
# Decode a decomposition field into its canonical and compatibility strings.
#
# The field is a whitespace-separated list of hexadecimal code points,
# optionally preceded by a "<tag>" token.
#
# * Without a tag, the decoded string is returned as both the canonical and
#   the compatibility value.
# * With a leading "<tag>", only the compatibility value is returned and the
#   canonical value is nil.
#
# @param hex [String, nil] the raw field
# @return [Array(String, String), Array(nil, String), Array(nil, nil)]
#   [canon, compat]; [nil, nil] when the field is nil or empty
# @raise [RuntimeError] when the first token is neither 4-6 upper-case hex
#   digits nor a "<tag>"
def hex2str(hex)
  return [nil, nil] if hex.nil? || hex == ''

  tokens = hex.split(" ")
  case tokens.first
  when /\A[0-9A-F]{4,6}\z/
    decoded = tokens.map(&:hex).pack("U*")
    [decoded, decoded]
  when /\A<.+>\z/
    # Skip the tag itself; everything after it is the compatibility mapping.
    [nil, tokens.drop(1).map(&:hex).pack("U*")]
  else
    raise "unknown value: " + hex
  end
end
60
+
61
# Decode a whitespace-separated list of hexadecimal code points into a string.
#
# @param str [String, nil] the raw field
# @return [String, nil] the decoded characters, or nil when the field is nil
#   or empty
def hex_or_nil(str)
  return nil if str.nil? || str == ''

  str.split(" ").map(&:hex).pack("U*")
end
70
+
71
# Render a string as a C string literal, or as NULL when there is no string.
#
# Bytes in the range 32..126 are emitted as-is, except the double quote (34)
# and the backslash (92); every other byte is written as a three-digit octal
# escape.
#
# @param str [String, nil] the value to render
# @return [String] "NULL" for a nil/false value, otherwise the double-quoted
#   literal
def printstr(str)
  return "NULL" unless str

  escaped = str.each_byte.map do |byte|
    if byte >= 32 && byte < 127 && byte != 34 && byte != 92
      byte.chr
    else
      format("\\%03o", byte)
    end
  end
  '"' + escaped.join + '"'
end
83
+
84
## scan Composition Exclusions
# Builds `exclusion`, a hash whose keys are the code points that the file
# named by ARGV[1] lists with the Full_Composition_Exclusion property. The
# first column of a matching line is either one hex code point or a
# "START..END" range; every code point of a range is recorded.
exclusion = {}
# File.open rather than Kernel#open: Kernel#open treats an argument starting
# with "|" as a command to run.
File.open(ARGV[1]) do |f|
  f.each_line do |l|
    next if l =~ /^\#/ || l =~ /^$/
    next if l !~ /Full_Composition_Exclusion/
    code, = l.split(/\s/)
    case code
    when /^[0-9A-F]+$/
      exclusion[code.hex] = true
    when /^([0-9A-F]+)\.\.([0-9A-F]+)$/
      ($1.hex..$2.hex).each { |cp| exclusion[cp] = true }
    end
  end
end
104
+
105
## scan Special Casing
# Builds `casing`, mapping a code point to [lower, title, upper] strings read
# from the file named by ARGV[2]. Lines carrying a fifth (condition) field
# are skipped, and a mapping equal to the code point itself is stored as nil.
casing = {}
# File.open rather than Kernel#open: Kernel#open treats an argument starting
# with "|" as a command to run.
File.open(ARGV[2]) do |f|
  f.each_line do |l|
    l.chomp!
    next if l =~ /^\#/ || l =~ /^$/
    # Cut the trailing "# comment" (up to the last "#"). A line without any
    # comment is used whole instead of failing on a nil match.
    fields = l[/^(.*)#/, 1] || l
    code, lower, title, upper, cond = fields.split(/;\s/)
    next if cond
    lower = nil if code == lower
    title = nil if code == title
    upper = nil if code == upper
    casing[code.hex] = [hex_or_nil(lower), hex_or_nil(title), hex_or_nil(upper)]
  end
end
123
+
124
## scan UnicodeData
# Builds `udata`, mapping a code point to
# [combining class, canon, compat, upcase, lowcase, titlecase], from the
# semicolon-separated records of the file named by ARGV[0].
udata = {}
# File.open rather than Kernel#open: Kernel#open treats an argument starting
# with "|" as a command to run.
File.open(ARGV[0]) do |f|
  f.each_line do |l|
    l.chomp!
    next if l.empty? # a blank line has no code point field to parse
    # Fields prefixed with "_" are present in each record but unused here.
    code, _charname, _gencat, ccclass, _bidicat, decomp,
      _dec, _digit, _num, _mirror, _uni1_0, _comment, upcase,
      lowcase, titlecase = l.split(";", 15)
    canon, compat = hex2str(decomp)
    udata[code.hex] = [ccclass.to_i, canon, compat,
                       hex_or_nil(upcase), hex_or_nil(lowcase),
                       hex_or_nil(titlecase)]
  end
end
141
+
142
# Write the generated C source to standard output: HEAD, one initializer row
# per code point in ascending order, a terminating row with code -1, then
# TAIL.
print HEAD
udata.sort.each do |code, data|
  ccclass, canon, compat, upcase, lowcase, titlecase = data

  ## Exclusions
  # ex: 0 = none, 1 = listed in `exclusion`, 2 = singleton decomposition,
  # 3 = decomposition whose first character has a non-zero combining class.
  ex = 0
  if exclusion[code] ## Script-specifics or Post Composition Version
    ex = 1
  elsif canon =~ /^.$/ ## Singletons
    ex = 2
  elsif canon
    starter = canon.unpack("U*").first
    # A starter with no udata entry is treated as combining class 0 (the
    # Unicode default for unlisted code points) instead of raising
    # NoMethodError on nil.
    starter_class = udata.key?(starter) ? udata[starter][0] : 0
    ex = 3 if starter_class != 0 ## Non-starter decompositions
  end

  ## Special Casing
  # Mappings collected in `casing` take precedence over the simple ones
  # wherever they are present.
  if (special = casing[code])
    lowcase = special[0] if special[0]
    titlecase = special[1] if special[1]
    upcase = special[2] if special[2]
  end

  printf(" { 0x%04x, %d, %d, %s, %s, %s, %s, %s }, \n",
         code, ccclass, ex, printstr(canon),
         printstr(compat), printstr(upcase), printstr(lowcase),
         printstr(titlecase))
end
# Row with code -1 closes the table.
printf(" { -1, 0, 0, NULL, NULL, NULL, NULL, NULL }\n")
print TAIL
data/unicode.gemspec ADDED
@@ -0,0 +1,13 @@
1
# Gem specification for the unicode gem, version 0.3.1.
Gem::Specification.new do |s|
  s.name = 'unicode'
  s.version = '0.3.1'
  s.date = '2010-02-26'
  s.summary = 'Unicode normalization library.'
  s.require_paths = %w[lib]
  s.author = 'Yoshida Masato'
  s.email = 'yoshidam@yoshidam.net'
  s.homepage = 'http://www.yoshidam.net/Ruby.html#unicode'
  # Every file tracked by git, except those whose path starts with a dot.
  s.files = `git ls-files`.split("\n").reject { |f| f.start_with?('.') }
  s.extra_rdoc_files = %w[README]
  s.extensions = %w[ext/unicode/extconf.rb]
end
metadata ADDED
@@ -0,0 +1,81 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: unicode
3
+ version: !ruby/object:Gem::Version
4
+ hash: 17
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 3
9
+ - 1
10
+ version: 0.3.1
11
+ platform: x86-mingw32
12
+ authors:
13
+ - Yoshida Masato
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2010-02-26 00:00:00 -03:00
19
+ default_executable:
20
+ dependencies: []
21
+
22
+ description:
23
+ email: yoshidam@yoshidam.net
24
+ executables: []
25
+
26
+ extensions: []
27
+
28
+ extra_rdoc_files:
29
+ - README
30
+ files:
31
+ - README
32
+ - Rakefile
33
+ - ext/unicode/extconf.rb
34
+ - ext/unicode/unicode.c
35
+ - ext/unicode/unidata.map
36
+ - ext/unicode/ustring.c
37
+ - ext/unicode/ustring.h
38
+ - ext/unicode/wstring.c
39
+ - ext/unicode/wstring.h
40
+ - lib/unicode.rb
41
+ - test/test.rb
42
+ - tools/README
43
+ - tools/mkunidata.rb
44
+ - unicode.gemspec
45
+ - lib/unicode/unicode_native.so
46
+ has_rdoc: true
47
+ homepage: http://www.yoshidam.net/Ruby.html#unicode
48
+ licenses: []
49
+
50
+ post_install_message:
51
+ rdoc_options: []
52
+
53
+ require_paths:
54
+ - lib
55
+ required_ruby_version: !ruby/object:Gem::Requirement
56
+ none: false
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ hash: 3
61
+ segments:
62
+ - 0
63
+ version: "0"
64
+ required_rubygems_version: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ">="
68
+ - !ruby/object:Gem::Version
69
+ hash: 3
70
+ segments:
71
+ - 0
72
+ version: "0"
73
+ requirements: []
74
+
75
+ rubyforge_project:
76
+ rubygems_version: 1.3.7
77
+ signing_key:
78
+ specification_version: 3
79
+ summary: Unicode normalization library.
80
+ test_files: []
81
+