unicode 0.3.1-x86-mingw32
Sign up to get free protection for your applications and to get access to all the features.
- data/README +113 -0
- data/Rakefile +16 -0
- data/ext/unicode/extconf.rb +3 -0
- data/ext/unicode/unicode.c +789 -0
- data/ext/unicode/unidata.map +21854 -0
- data/ext/unicode/ustring.c +208 -0
- data/ext/unicode/ustring.h +48 -0
- data/ext/unicode/wstring.c +189 -0
- data/ext/unicode/wstring.h +41 -0
- data/lib/unicode.rb +6 -0
- data/lib/unicode/unicode_native.so +0 -0
- data/test/test.rb +69 -0
- data/tools/README +6 -0
- data/tools/mkunidata.rb +169 -0
- data/unicode.gemspec +13 -0
- metadata +81 -0
data/lib/unicode.rb
ADDED
Binary file
|
data/test/test.rb
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
#! /usr/local/bin/ruby -KU
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
|
4
|
+
require 'unicode'
|
5
|
+
|
6
|
+
## dump Unicode string
|
7
|
+
class String
  # Dump the receiver as a list of Unicode code point labels.
  #
  # The receiver is decoded with unpack("U*"), which always yields Integers,
  # so every element of the result is a formatted label.
  #
  # Returns an Array of Strings, one "U+XXXX" entry per code point, written
  # in uppercase hexadecimal and zero-padded to at least four digits.
  def udump
    unpack("U*").map { |codepoint| "U+%04X" % codepoint }
  end
end
|
21
|
+
|
22
|
+
|
23
|
+
print "Canonical decomposition vs compatibility decomposition\n"
|
24
|
+
p Unicode::decompose("⑽ o\xef\xac\x83ce").udump
|
25
|
+
p Unicode::decompose_compat("⑽ o\xef\xac\x83ce")
|
26
|
+
|
27
|
+
print "Canonical equivalent vs Compatibility equivalent\n"
|
28
|
+
p Unicode::strcmp("ガ", "ガ")
|
29
|
+
p Unicode::strcmp("ガ", "ガ")
|
30
|
+
p Unicode::strcmp_compat("ガ", "ガ")
|
31
|
+
|
32
|
+
print "Decomposition/composition\n"
|
33
|
+
p Unicode::normalize_D([0x63, 0x301, 0x327].pack("U*")).udump
|
34
|
+
p Unicode::normalize_D([0x63, 0x327, 0x301].pack("U*")).udump
|
35
|
+
p Unicode::normalize_D([0x107, 0x327].pack("U*")).udump
|
36
|
+
p Unicode::normalize_D([0xe7, 0x301].pack("U*")).udump
|
37
|
+
p Unicode::normalize_C([0x63, 0x301, 0x327].pack("U*")).udump
|
38
|
+
p Unicode::normalize_C([0x63, 0x327, 0x301].pack("U*")).udump
|
39
|
+
p Unicode::normalize_C([0x107, 0x327].pack("U*")).udump
|
40
|
+
p Unicode::normalize_C([0xe7, 0x301].pack("U*")).udump
|
41
|
+
|
42
|
+
print "Kana Normalization\n"
|
43
|
+
p Unicode::normalize_D("ガガ").udump
|
44
|
+
p Unicode::normalize_C("ガガ").udump
|
45
|
+
p Unicode::normalize_KD("ガガ").udump
|
46
|
+
p Unicode::normalize_KC("ガガ").udump
|
47
|
+
|
48
|
+
print "Hangul\n"
|
49
|
+
p "요시담".udump
|
50
|
+
p Unicode::normalize_D("요시담").udump
|
51
|
+
p Unicode::normalize_C("요시담").udump
|
52
|
+
|
53
|
+
print "Composition Exclusion\n"
|
54
|
+
print " ANGSTROM SIGN [U+212B]\n"
|
55
|
+
p Unicode::normalize_D([0x212b].pack("U")).udump
|
56
|
+
p Unicode::normalize_C([0x212b].pack("U")).udump
|
57
|
+
print " LATIN CAPITAL LETTER A WITH RING ABOVE [U+00C5]\n"
|
58
|
+
p Unicode::normalize_D([0x00c5].pack("U")).udump
|
59
|
+
p Unicode::normalize_C([0x00c5].pack("U")).udump
|
60
|
+
|
61
|
+
print "Case conversion\n"
|
62
|
+
p Unicode::normalize_C(Unicode::upcase([0x63, 0x301, 0x327, 0xff41].pack("U*"))).udump
|
63
|
+
p Unicode::normalize_C(Unicode::downcase([0x43, 0x301, 0x327, 0xff21].pack("U*"))).udump
|
64
|
+
p Unicode::capitalize([0x1f1, 0x41, 0x61, 0xff21].pack("U*")).udump
|
65
|
+
|
66
|
+
|
67
|
+
## Local variables:
|
68
|
+
## coding: utf-8
|
69
|
+
## End:
|
data/tools/README
ADDED
data/tools/mkunidata.rb
ADDED
@@ -0,0 +1,169 @@
|
|
1
|
+
#! /usr/local/bin/ruby -KU
|
2
|
+
|
3
|
+
#if $KCODE != 'UTF8'
|
4
|
+
# raise "$KCODE must be UTF8"
|
5
|
+
#end
|
6
|
+
|
7
|
+
HEAD=<<EOS
|
8
|
+
/*
|
9
|
+
* UnicodeData
|
10
|
+
* Copyright 1999, 2004, 2010 by yoshidam
|
11
|
+
*
|
12
|
+
*/
|
13
|
+
|
14
|
+
#ifndef _UNIDATA_MAP
|
15
|
+
#define _UNIDATA_MAP
|
16
|
+
|
17
|
+
struct unicode_data {
|
18
|
+
const int code;
|
19
|
+
const int combining_class;
|
20
|
+
const int exclusion;
|
21
|
+
const char* const canon;
|
22
|
+
const char* const compat;
|
23
|
+
const char* uppercase;
|
24
|
+
const char* lowercase;
|
25
|
+
const char* titlecase;
|
26
|
+
};
|
27
|
+
|
28
|
+
static const struct unicode_data unidata[] = {
|
29
|
+
EOS
|
30
|
+
|
31
|
+
TAIL=<<EOS
|
32
|
+
};
|
33
|
+
|
34
|
+
#endif
|
35
|
+
EOS
|
36
|
+
|
37
|
+
# Convert a decomposition field into UTF-8 strings.
#
# +hex+ is a space-separated list of uppercase hexadecimal code points,
# optionally preceded by a "<tag>" token.
#
# Returns a two-element Array [canon, compat]:
#   * [nil, nil] when +hex+ is nil or empty
#   * [str, str] when the first token is a 4-6 digit hex code point
#     (both elements are the same String object)
#   * [nil, str] when the first token is a "<tag>"; the tag is dropped
#
# Raises RuntimeError ("unknown value: ...") when the first token is
# neither a code point nor a tag.
def hex2str(hex)
  return [nil, nil] if hex.nil? || hex == ''

  tokens = hex.split(" ")
  case tokens.first
  when /\A[0-9A-F]{4,6}\z/
    # Build the string in one step instead of appending to a string literal,
    # so the method also works under `frozen_string_literal: true`.
    decoded = tokens.map(&:hex).pack("U*")
    [decoded, decoded]
  when /\A<.+>\z/
    [nil, tokens.drop(1).map(&:hex).pack("U*")]
  else
    raise "unknown value: " + hex
  end
end
|
60
|
+
|
61
|
+
# Convert a space-separated list of hexadecimal code points to a UTF-8 String.
#
# Returns nil when +str+ is nil or empty. Tokens are converted with
# String#hex, so no validation is performed on them.
def hex_or_nil(str)
  return nil if str.nil? || str == ''

  # pack("U*") builds the whole string at once; nothing is appended to a
  # string literal, so this also works under `frozen_string_literal: true`.
  str.split(" ").map(&:hex).pack("U*")
end
|
70
|
+
|
71
|
+
# Render +str+ as the text of a C string literal.
#
# Returns the bare word NULL when +str+ is nil (or false). Otherwise each
# byte is emitted verbatim if it is in the range 32..126 and is neither a
# double quote (34) nor a backslash (92); every other byte becomes a
# three-digit octal escape. The result is wrapped in double quotes.
def printstr(str)
  return "NULL" unless str

  pieces = str.each_byte.map do |byte|
    if byte >= 32 && byte < 127 && byte != 34 && byte != 92
      byte.chr
    else
      format("\\%03o", byte)
    end
  end
  '"' + pieces.join + '"'
end
|
83
|
+
|
84
|
+
## scan Composition Exclusions
|
85
|
+
# Code points flagged Full_Composition_Exclusion in the file named by
# ARGV[1]. Maps Integer code point => true.
exclusion = {}
# File.foreach, unlike Kernel#open, never treats a "|command" argument as a
# subprocess to spawn.
File.foreach(ARGV[1]) do |line|
  next if line.start_with?('#') || line.chomp.empty?
  next unless line.include?('Full_Composition_Exclusion')

  # The first whitespace-delimited token is either a single code point
  # ("0958") or an inclusive range ("0958..095F"); anything else is ignored.
  case line.split(/\s/).first
  when /\A[0-9A-F]+\z/
    exclusion[Regexp.last_match(0).hex] = true
  when /\A([0-9A-F]+)\.\.([0-9A-F]+)\z/
    range_start = Regexp.last_match(1).hex
    range_end = Regexp.last_match(2).hex
    (range_start..range_end).each { |cp| exclusion[cp] = true }
  end
end
|
104
|
+
|
105
|
+
## scan Special Casing
|
106
|
+
# Unconditional special case mappings read from the file named by ARGV[2].
# Maps Integer code point => [lower, title, upper]; each element is a UTF-8
# String, or nil when the field is empty or equal to the code point itself.
casing = {}
# File.foreach, unlike Kernel#open, never treats a "|command" argument as a
# subprocess to spawn.
File.foreach(ARGV[2]) do |raw|
  line = raw.chomp
  next if line.start_with?('#') || line.empty?

  # Drop the trailing "# comment". A data line without a comment is used as
  # is (the original regexp capture was nil in that case and crashed).
  fields = line.sub(/#.*/, '').split(/;\s*/)
  code, lower, title, upper, cond = fields
  # Skip lines with no code field and entries that carry a condition field.
  next if code.nil? || cond

  mappings = [lower, title, upper].map do |mapping|
    hex_or_nil(mapping) unless mapping == code
  end
  casing[code.hex] = mappings
end
|
123
|
+
|
124
|
+
## scan UnicodeData
|
125
|
+
# Per-character records read from the file named by ARGV[0].
# Maps Integer code point =>
#   [combining class, canonical decomposition, compatibility decomposition,
#    uppercase, lowercase, titlecase]
udata = {}
# File.foreach, unlike Kernel#open, never treats a "|command" argument as a
# subprocess to spawn.
File.foreach(ARGV[0]) do |raw|
  line = raw.chomp
  next if line.empty? # a blank line has no code field to convert

  # Only six of the fifteen semicolon-separated fields are used:
  # 0 = code point, 3 = combining class, 5 = decomposition,
  # 12 / 13 / 14 = uppercase / lowercase / titlecase mapping.
  code, ccclass, decomp, upcase, lowcase, titlecase =
    line.split(";", 15).values_at(0, 3, 5, 12, 13, 14)
  canon, compat = hex2str(decomp)
  udata[code.hex] = [ccclass.to_i, canon, compat,
                     hex_or_nil(upcase), hex_or_nil(lowcase),
                     hex_or_nil(titlecase)]
end
|
141
|
+
|
142
|
+
# Emit the table: HEAD, one initializer row per code point in ascending
# order, a terminating sentinel row with code -1, then TAIL.
print HEAD
udata.sort.each do |code, data|
  ccclass, canon, compat, upcase, lowcase, titlecase = data

  ## Exclusions
  ex =
    if exclusion[code] ## Script-specifics or Post Composition Version
      1
    elsif canon =~ /^.$/ ## Singletons
      2
    elsif canon
      ## Non-starter decompositions: the first code point of the canonical
      ## decomposition has a non-zero combining class. A first code point
      ## with no record in udata is treated as combining class 0 instead of
      ## raising NoMethodError on nil.
      starter = udata[canon.unpack("U*").first]
      starter && starter[0] != 0 ? 3 : 0
    else
      0
    end

  ## Special Casing: each non-nil special mapping overrides the simple one.
  special = casing[code]
  if special
    lowcase = special[0] || lowcase
    titlecase = special[1] || titlecase
    upcase = special[2] || upcase
  end

  printf(" { 0x%04x, %d, %d, %s, %s, %s, %s, %s }, \n",
         code, ccclass, ex, printstr(canon),
         printstr(compat), printstr(upcase), printstr(lowcase),
         printstr(titlecase))
end
printf(" { -1, 0, 0, NULL, NULL, NULL, NULL, NULL }\n")
print TAIL
|
data/unicode.gemspec
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
# Gem specification for the unicode library, version 0.3.1.
Gem::Specification.new do |spec|
  spec.name = 'unicode'
  spec.version = '0.3.1'
  spec.date = '2010-02-26'
  spec.summary = 'Unicode normalization library.'
  spec.require_paths = ['lib']
  spec.author = 'Yoshida Masato'
  spec.email = 'yoshidam@yoshidam.net'
  spec.homepage = 'http://www.yoshidam.net/Ruby.html#unicode'
  # Package every file git tracks, except paths that begin with a dot.
  tracked = `git ls-files`.split("\n")
  spec.files = tracked.reject { |path| path =~ /^\./ }
  spec.extra_rdoc_files = ['README']
  spec.extensions = ['ext/unicode/extconf.rb']
end
|
metadata
ADDED
@@ -0,0 +1,81 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: unicode
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 17
|
5
|
+
prerelease: false
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 3
|
9
|
+
- 1
|
10
|
+
version: 0.3.1
|
11
|
+
platform: x86-mingw32
|
12
|
+
authors:
|
13
|
+
- Yoshida Masato
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2010-02-26 00:00:00 -03:00
|
19
|
+
default_executable:
|
20
|
+
dependencies: []
|
21
|
+
|
22
|
+
description:
|
23
|
+
email: yoshidam@yoshidam.net
|
24
|
+
executables: []
|
25
|
+
|
26
|
+
extensions: []
|
27
|
+
|
28
|
+
extra_rdoc_files:
|
29
|
+
- README
|
30
|
+
files:
|
31
|
+
- README
|
32
|
+
- Rakefile
|
33
|
+
- ext/unicode/extconf.rb
|
34
|
+
- ext/unicode/unicode.c
|
35
|
+
- ext/unicode/unidata.map
|
36
|
+
- ext/unicode/ustring.c
|
37
|
+
- ext/unicode/ustring.h
|
38
|
+
- ext/unicode/wstring.c
|
39
|
+
- ext/unicode/wstring.h
|
40
|
+
- lib/unicode.rb
|
41
|
+
- test/test.rb
|
42
|
+
- tools/README
|
43
|
+
- tools/mkunidata.rb
|
44
|
+
- unicode.gemspec
|
45
|
+
- lib/unicode/unicode_native.so
|
46
|
+
has_rdoc: true
|
47
|
+
homepage: http://www.yoshidam.net/Ruby.html#unicode
|
48
|
+
licenses: []
|
49
|
+
|
50
|
+
post_install_message:
|
51
|
+
rdoc_options: []
|
52
|
+
|
53
|
+
require_paths:
|
54
|
+
- lib
|
55
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
56
|
+
none: false
|
57
|
+
requirements:
|
58
|
+
- - ">="
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
hash: 3
|
61
|
+
segments:
|
62
|
+
- 0
|
63
|
+
version: "0"
|
64
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ">="
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
hash: 3
|
70
|
+
segments:
|
71
|
+
- 0
|
72
|
+
version: "0"
|
73
|
+
requirements: []
|
74
|
+
|
75
|
+
rubyforge_project:
|
76
|
+
rubygems_version: 1.3.7
|
77
|
+
signing_key:
|
78
|
+
specification_version: 3
|
79
|
+
summary: Unicode normalization library.
|
80
|
+
test_files: []
|
81
|
+
|