unicode 0.3.1-x86-mswin32-60

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/unicode.rb ADDED
@@ -0,0 +1,6 @@
1
# Load the native extension.
#
# First try the build that lives in a directory named after the running
# interpreter's "major.minor" version (e.g. unicode/1.8/unicode_native);
# if that file cannot be loaded, fall back to the unversioned build.
begin
  # The dot is escaped so that only a literal "." separates the two numbers,
  # and the match is taken directly instead of going through the global $1.
  require "unicode/#{RUBY_VERSION[/\d+\.\d+/]}/unicode_native"
rescue LoadError
  require 'unicode/unicode_native'
end
Binary file
data/test/test.rb ADDED
@@ -0,0 +1,69 @@
1
+ #! /usr/local/bin/ruby -KU
2
+ # -*- coding: utf-8 -*-
3
+
4
+ require 'unicode'
5
+
6
+ ## dump Unicode string
7
class String
  # Dumps the string as a list of code point labels.
  #
  # The receiver is decoded with unpack("U*"), so it must hold valid UTF-8;
  # unpack raises ArgumentError otherwise.
  #
  # Returns an Array of Strings, one "U+XXXX" entry (at least four uppercase
  # hex digits) per code point, in order. An empty string gives [].
  def udump
    # unpack("U*") only ever yields Integers, so every element is formatted;
    # no pass-through branch for other element types is needed.
    unpack("U*").map { |codepoint| "U+%04X" % codepoint }
  end
end
21
+
22
+
23
# Demo script: each section prints a heading and then the result of calling
# the Unicode module on fixed inputs. Nothing is asserted; the output is
# meant to be inspected by eye.
#
# Section 1: the same literal goes through Unicode::decompose and
# Unicode::decompose_compat. The "\xef\xac\x83" escape is the UTF-8 encoding
# of U+FB03. Only the first result is rendered with udump; the second is
# printed as the raw returned value.
+ print "Canonical decomposition vs compatibility decomposition\n"
24
+ p Unicode::decompose("⑽ o\xef\xac\x83ce").udump
25
+ p Unicode::decompose_compat("⑽ o\xef\xac\x83ce")
26
+
27
# Section 2: prints the return values of Unicode::strcmp (twice) and
# Unicode::strcmp_compat for pairs of kana literals.
# NOTE(review): the two arguments of each call render identically here;
# presumably they differ in composed/decomposed or half-width form in the
# upstream file -- confirm before relying on the expected output.
+ print "Canonical equivalent vs Compatibility equivalent\n"
28
+ p Unicode::strcmp("ガ", "ガ")
29
+ p Unicode::strcmp("ガ", "ガ")
30
+ p Unicode::strcmp_compat("ガ", "ガ")
31
+
32
# Section 3: normalize_D and then normalize_C over the same four inputs:
# U+0063 followed by U+0301 and U+0327 in both orders, U+0107 + U+0327,
# and U+00E7 + U+0301. Each result is dumped as code points.
+ print "Decomposition/composition\n"
33
+ p Unicode::normalize_D([0x63, 0x301, 0x327].pack("U*")).udump
34
+ p Unicode::normalize_D([0x63, 0x327, 0x301].pack("U*")).udump
35
+ p Unicode::normalize_D([0x107, 0x327].pack("U*")).udump
36
+ p Unicode::normalize_D([0xe7, 0x301].pack("U*")).udump
37
+ p Unicode::normalize_C([0x63, 0x301, 0x327].pack("U*")).udump
38
+ p Unicode::normalize_C([0x63, 0x327, 0x301].pack("U*")).udump
39
+ p Unicode::normalize_C([0x107, 0x327].pack("U*")).udump
40
+ p Unicode::normalize_C([0xe7, 0x301].pack("U*")).udump
41
+
42
# Section 4: one kana literal through all four normalizers
# (normalize_D, normalize_C, normalize_KD, normalize_KC).
+ print "Kana Normalization\n"
43
+ p Unicode::normalize_D("ガガ").udump
44
+ p Unicode::normalize_C("ガガ").udump
45
+ p Unicode::normalize_KD("ガガ").udump
46
+ p Unicode::normalize_KC("ガガ").udump
47
+
48
# Section 5: a Hangul literal dumped as-is, then after normalize_D and
# after normalize_C.
+ print "Hangul\n"
49
+ p "요시담".udump
50
+ p Unicode::normalize_D("요시담").udump
51
+ p Unicode::normalize_C("요시담").udump
52
+
53
# Section 6: normalize_D / normalize_C applied to the single code points
# U+212B and U+00C5 named in the printed sub-headings.
+ print "Composition Exclusion\n"
54
+ print " ANGSTROM SIGN [U+212B]\n"
55
+ p Unicode::normalize_D([0x212b].pack("U")).udump
56
+ p Unicode::normalize_C([0x212b].pack("U")).udump
57
+ print " LATIN CAPITAL LETTER A WITH RING ABOVE [U+00C5]\n"
58
+ p Unicode::normalize_D([0x00c5].pack("U")).udump
59
+ p Unicode::normalize_C([0x00c5].pack("U")).udump
60
+
61
# Section 7: the upcase and downcase results are passed through normalize_C
# before being dumped; the capitalize result is dumped directly.
+ print "Case conversion\n"
62
+ p Unicode::normalize_C(Unicode::upcase([0x63, 0x301, 0x327, 0xff41].pack("U*"))).udump
63
+ p Unicode::normalize_C(Unicode::downcase([0x43, 0x301, 0x327, 0xff21].pack("U*"))).udump
64
+ p Unicode::capitalize([0x1f1, 0x41, 0x61, 0xff21].pack("U*")).udump
65
+
66
+
67
+ ## Local variables:
68
+ ## coding: utf-8
69
+ ## End:
data/tools/README ADDED
@@ -0,0 +1,6 @@
1
+ The unidata.map is created from UnicodeData.txt,
2
+ DerivedNormalizationProps.txt and SpecialCasing.txt of Unicode 5.2
3
+
4
+ To update unidata.map, run:
5
+
6
+ ruby mkunidata.rb UnicodeData.txt DerivedNormalizationProps.txt SpecialCasing.txt > unidata.map
@@ -0,0 +1,169 @@
1
+ #! /usr/local/bin/ruby -KU
2
+
3
+ #if $KCODE != 'UTF8'
4
+ # raise "$KCODE must be UTF8"
5
+ #end
6
+
7
# HEAD and TAIL hold the fixed C text printed around the generated table.
# HEAD carries a header comment, the opening of the _UNIDATA_MAP include
# guard, the declaration of struct unicode_data (one member per column the
# script prints) and the opening of the unidata[] initializer. TAIL closes
# the initializer and the include guard.
+ HEAD=<<EOS
8
+ /*
9
+ * UnicodeData
10
+ * Copyright 1999, 2004, 2010 by yoshidam
11
+ *
12
+ */
13
+
14
+ #ifndef _UNIDATA_MAP
15
+ #define _UNIDATA_MAP
16
+
17
+ struct unicode_data {
18
+ const int code;
19
+ const int combining_class;
20
+ const int exclusion;
21
+ const char* const canon;
22
+ const char* const compat;
23
+ const char* uppercase;
24
+ const char* lowercase;
25
+ const char* titlecase;
26
+ };
27
+
28
+ static const struct unicode_data unidata[] = {
29
+ EOS
30
+
31
+ TAIL=<<EOS
32
+ };
33
+
34
+ #endif
35
+ EOS
36
+
37
# Converts a decomposition field into its canonical and compatibility strings.
#
# hex:: space-separated hexadecimal code points, optionally preceded by a
#       "<tag>" token (for example "<compat> 0020 0308"); may be nil or "".
#
# Returns a two-element Array [canon, compat]:
# * [nil, nil] when hex is nil or "";
# * [str, str] when the first token is a 4-6 digit uppercase hex number --
#   every token is converted, and the two strings are equal but are
#   separate objects, so changing one never affects the other;
# * [nil, str] when the first token is a "<...>" tag -- the tag is dropped
#   and the remaining tokens are converted.
#
# Raises RuntimeError ("unknown value: ...") when the first token is
# neither of those forms (this includes whitespace-only input).
def hex2str(hex)
  return [nil, nil] if hex.nil? || hex == ''

  first, *rest = hex.split(" ")
  case first
  when /^[0-9A-F]{4,6}$/
    canon = [first, *rest].map(&:hex).pack("U*")
    [canon, canon.dup]
  when /^<.+>$/
    [nil, rest.map(&:hex).pack("U*")]
  else
    raise "unknown value: " + hex
  end
end
60
+
61
# Converts a space-separated list of hexadecimal code points to a string.
#
# str:: e.g. "0053 0073"; may be nil or "".
#
# Returns nil when str is nil or "", otherwise the UTF-8 string made of the
# listed code points (an empty string when str holds only whitespace).
def hex_or_nil(str)
  return nil if str.nil? || str == ''

  str.split(" ").map(&:hex).pack("U*")
end
70
+
71
# Renders a string as a C string literal, byte by byte.
#
# str:: the string to render; nil (or false) stands for "no value".
#
# Returns the text NULL when str is nil/false. Otherwise returns the bytes of
# str wrapped in double quotes: printable ASCII bytes (32..126) other than
# '"' (34) and '\' (92) are copied as-is, and every other byte is written as
# a three-digit octal escape such as \303.
def printstr(str)
  return "NULL" unless str

  body = str.each_byte.map do |byte|
    literal = byte >= 32 && byte < 127 && byte != 34 && byte != 92
    literal ? byte.chr : format("\\%03o", byte)
  end
  '"' + body.join + '"'
end
83
+
84
## scan Composition Exclusions
# Builds `exclusion`, a Hash of code point (Integer) => true, from the file
# named by ARGV[1]. Only non-comment lines mentioning
# Full_Composition_Exclusion are used; their first whitespace-delimited token
# is either a single hex code point or a "XXXX..YYYY" range, whose members
# are all recorded.
exclusion = {}
# File.foreach always treats the argument as a file path; Kernel#open would
# run it as a command if it started with "|".
File.foreach(ARGV[1]) do |line|
  next if line.start_with?('#')
  next unless line.include?('Full_Composition_Exclusion')

  code = line.split(/\s/).first
  case code
  when /^[0-9A-F]+$/
    exclusion[code.hex] = true
  when /^([0-9A-F]+)\.\.([0-9A-F]+)$/
    # p [$1, $2]
    ($1.hex..$2.hex).each { |member| exclusion[member] = true }
  end
end
104
+
105
## scan Special Casing
# Builds `casing`, a Hash of code point (Integer) => [lower, title, upper],
# from the file named by ARGV[2]. Each entry is the converted mapping string,
# or nil when the mapping is empty or equal to the code point itself.
casing = {}
# File.foreach always treats the argument as a file path; Kernel#open would
# run it as a command if it started with "|".
File.foreach(ARGV[2]) do |line|
  line.chomp!
  next if line.start_with?('#') || line.empty?

  # Keep only the text before the last "#"; a line without a trailing
  # comment is used whole instead of failing on a nil match.
  record = line[/\A(.*)#/, 1] || line
  code, lower, title, upper, cond = record.split(/;\s/)
  next if cond # records that carry a fifth (condition) field are skipped

  casing[code.hex] = [lower, title, upper].map do |mapping|
    hex_or_nil(mapping == code ? nil : mapping)
  end
end
123
+
124
## scan UnicodeData
# Builds `udata`, a Hash of code point (Integer) =>
# [ccclass, canon, compat, upcase, lowcase, titlecase], from the file named
# by ARGV[0]. Each line is split on ";" into at most 15 fields; only fields
# 0 (code), 3 (combining class), 5 (decomposition) and 12-14 (upper, lower
# and title case mappings) are used.
udata = {}
# File.foreach always treats the argument as a file path; Kernel#open would
# run it as a command if it started with "|".
File.foreach(ARGV[0]) do |line|
  code, ccclass, decomp, upcase, lowcase, titlecase =
    line.chomp.split(";", 15).values_at(0, 3, 5, 12, 13, 14)
  next unless code # blank line: nothing to record

  canon, compat = hex2str(decomp)
  udata[code.hex] = [
    ccclass.to_i, canon, compat,
    hex_or_nil(upcase), hex_or_nil(lowcase), hex_or_nil(titlecase)
  ]
end
141
+
142
# Emit the C table: HEAD, one initializer row per code point in ascending
# order, a terminating row whose code is -1, then TAIL.
print HEAD
udata.sort_by { |code, _| code }.each do |code, data|
  ccclass, canon, compat, upcase, lowcase, titlecase = data
  ## Exclusions
  # ex is 0 for "not excluded", otherwise the reason code chosen below.
  ex =
    if exclusion[code] ## Script-specifics or Post Composition Version
      1
    elsif canon =~ /^.$/ ## Singletons
      2
    elsif canon
      ## Non-starter decompositions
      # A first code point with no record of its own is treated as having
      # combining class 0 instead of raising on a nil lookup.
      starter = udata[canon.unpack("U*")[0]]
      starter && starter[0] != 0 ? 3 : 0
    else
      0
    end
  ## Special Casing
  # Mappings found in the special-casing table override the simple ones.
  if (special = casing[code])
    lowcase = special[0] if special[0]
    titlecase = special[1] if special[1]
    upcase = special[2] if special[2]
  end
  printf(" { 0x%04x, %d, %d, %s, %s, %s, %s, %s }, \n",
         code, ccclass, ex, printstr(canon),
         printstr(compat), printstr(upcase), printstr(lowcase),
         printstr(titlecase))
end
printf(" { -1, 0, 0, NULL, NULL, NULL, NULL, NULL }\n")
print TAIL
data/unicode.gemspec ADDED
@@ -0,0 +1,13 @@
1
# Gem specification for the "unicode" gem.
Gem::Specification.new do |spec|
  spec.name = 'unicode'
  spec.version = '0.3.1'
  spec.date = '2010-02-26'
  spec.summary = 'Unicode normalization library.'
  spec.require_paths = ['lib']
  spec.author = 'Yoshida Masato'
  spec.email = 'yoshidam@yoshidam.net'
  spec.homepage = 'http://www.yoshidam.net/Ruby.html#unicode'

  # Package every file tracked by git except those whose path starts with ".".
  tracked = `git ls-files`.split("\n")
  spec.files = tracked.reject { |path| path =~ /^\./ }

  spec.extra_rdoc_files = ['README']
  spec.extensions = ['ext/unicode/extconf.rb']
end
metadata ADDED
@@ -0,0 +1,81 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: unicode
3
+ version: !ruby/object:Gem::Version
4
+ hash: 17
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 3
9
+ - 1
10
+ version: 0.3.1
11
+ platform: x86-mswin32-60
12
+ authors:
13
+ - Yoshida Masato
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2010-02-26 00:00:00 -03:00
19
+ default_executable:
20
+ dependencies: []
21
+
22
+ description:
23
+ email: yoshidam@yoshidam.net
24
+ executables: []
25
+
26
+ extensions: []
27
+
28
+ extra_rdoc_files:
29
+ - README
30
+ files:
31
+ - README
32
+ - Rakefile
33
+ - ext/unicode/extconf.rb
34
+ - ext/unicode/unicode.c
35
+ - ext/unicode/unidata.map
36
+ - ext/unicode/ustring.c
37
+ - ext/unicode/ustring.h
38
+ - ext/unicode/wstring.c
39
+ - ext/unicode/wstring.h
40
+ - lib/unicode.rb
41
+ - test/test.rb
42
+ - tools/README
43
+ - tools/mkunidata.rb
44
+ - unicode.gemspec
45
+ - lib/unicode/unicode_native.so
46
+ has_rdoc: true
47
+ homepage: http://www.yoshidam.net/Ruby.html#unicode
48
+ licenses: []
49
+
50
+ post_install_message:
51
+ rdoc_options: []
52
+
53
+ require_paths:
54
+ - lib
55
+ required_ruby_version: !ruby/object:Gem::Requirement
56
+ none: false
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ hash: 3
61
+ segments:
62
+ - 0
63
+ version: "0"
64
+ required_rubygems_version: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ">="
68
+ - !ruby/object:Gem::Version
69
+ hash: 3
70
+ segments:
71
+ - 0
72
+ version: "0"
73
+ requirements: []
74
+
75
+ rubyforge_project:
76
+ rubygems_version: 1.3.7
77
+ signing_key:
78
+ specification_version: 3
79
+ summary: Unicode normalization library.
80
+ test_files: []
81
+