unicode 0.3.1-x86-mswin32-60
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +113 -0
- data/Rakefile +16 -0
- data/ext/unicode/extconf.rb +3 -0
- data/ext/unicode/unicode.c +789 -0
- data/ext/unicode/unidata.map +21854 -0
- data/ext/unicode/ustring.c +208 -0
- data/ext/unicode/ustring.h +48 -0
- data/ext/unicode/wstring.c +189 -0
- data/ext/unicode/wstring.h +41 -0
- data/lib/unicode.rb +6 -0
- data/lib/unicode/unicode_native.so +0 -0
- data/test/test.rb +69 -0
- data/tools/README +6 -0
- data/tools/mkunidata.rb +169 -0
- data/unicode.gemspec +13 -0
- metadata +81 -0
data/lib/unicode.rb
ADDED
Binary file
|
data/test/test.rb
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
#! /usr/local/bin/ruby -KU
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
|
4
|
+
require 'unicode'
|
5
|
+
|
6
|
+
## dump Unicode string
|
7
|
+
class String
  ## Dump the receiver as a list of Unicode code point labels.
  #
  # The string is decoded with unpack("U*"), so it must contain valid
  # UTF-8; each code point is rendered as "U+" followed by at least four
  # upper-case hexadecimal digits.
  #
  # Returns an Array of Strings, e.g. "A".udump => ["U+0041"].
  def udump
    unpack("U*").map { |codepoint| format("U+%04X", codepoint) }
  end
end
|
21
|
+
|
22
|
+
|
23
|
+
print "Canonical decomposition vs compatibility decomposition\n"
|
24
|
+
p Unicode::decompose("⑽ o\xef\xac\x83ce").udump
|
25
|
+
p Unicode::decompose_compat("⑽ o\xef\xac\x83ce")
|
26
|
+
|
27
|
+
print "Canonical equivalent vs Compatibility equivalent\n"
|
28
|
+
p Unicode::strcmp("ガ", "ガ")
|
29
|
+
p Unicode::strcmp("ガ", "ガ")
|
30
|
+
p Unicode::strcmp_compat("ガ", "ガ")
|
31
|
+
|
32
|
+
print "Decomposition/composition\n"
|
33
|
+
p Unicode::normalize_D([0x63, 0x301, 0x327].pack("U*")).udump
|
34
|
+
p Unicode::normalize_D([0x63, 0x327, 0x301].pack("U*")).udump
|
35
|
+
p Unicode::normalize_D([0x107, 0x327].pack("U*")).udump
|
36
|
+
p Unicode::normalize_D([0xe7, 0x301].pack("U*")).udump
|
37
|
+
p Unicode::normalize_C([0x63, 0x301, 0x327].pack("U*")).udump
|
38
|
+
p Unicode::normalize_C([0x63, 0x327, 0x301].pack("U*")).udump
|
39
|
+
p Unicode::normalize_C([0x107, 0x327].pack("U*")).udump
|
40
|
+
p Unicode::normalize_C([0xe7, 0x301].pack("U*")).udump
|
41
|
+
|
42
|
+
print "Kana Normalization\n"
|
43
|
+
p Unicode::normalize_D("ガガ").udump
|
44
|
+
p Unicode::normalize_C("ガガ").udump
|
45
|
+
p Unicode::normalize_KD("ガガ").udump
|
46
|
+
p Unicode::normalize_KC("ガガ").udump
|
47
|
+
|
48
|
+
print "Hangul\n"
|
49
|
+
p "요시담".udump
|
50
|
+
p Unicode::normalize_D("요시담").udump
|
51
|
+
p Unicode::normalize_C("요시담").udump
|
52
|
+
|
53
|
+
print "Composition Exclusion\n"
|
54
|
+
print " ANGSTROM SIGN [U+212B]\n"
|
55
|
+
p Unicode::normalize_D([0x212b].pack("U")).udump
|
56
|
+
p Unicode::normalize_C([0x212b].pack("U")).udump
|
57
|
+
print " LATIN CAPITAL LETTER A WITH RING ABOVE [U+00C5]\n"
|
58
|
+
p Unicode::normalize_D([0x00c5].pack("U")).udump
|
59
|
+
p Unicode::normalize_C([0x00c5].pack("U")).udump
|
60
|
+
|
61
|
+
print "Case conversion\n"
|
62
|
+
p Unicode::normalize_C(Unicode::upcase([0x63, 0x301, 0x327, 0xff41].pack("U*"))).udump
|
63
|
+
p Unicode::normalize_C(Unicode::downcase([0x43, 0x301, 0x327, 0xff21].pack("U*"))).udump
|
64
|
+
p Unicode::capitalize([0x1f1, 0x41, 0x61, 0xff21].pack("U*")).udump
|
65
|
+
|
66
|
+
|
67
|
+
## Local variables:
|
68
|
+
## coding: utf-8
|
69
|
+
## End:
|
data/tools/README
ADDED
data/tools/mkunidata.rb
ADDED
@@ -0,0 +1,169 @@
|
|
1
|
+
#! /usr/local/bin/ruby -KU
|
2
|
+
|
3
|
+
#if $KCODE != 'UTF8'
|
4
|
+
# raise "$KCODE must be UTF8"
|
5
|
+
#end
|
6
|
+
|
7
|
+
## C source printed before the generated table rows: banner comment,
## include guard, the unicode_data struct declaration and the opening of
## the unidata[] initializer.
HEAD = <<'EOS'
/*
* UnicodeData
* Copyright 1999, 2004, 2010 by yoshidam
*
*/

#ifndef _UNIDATA_MAP
#define _UNIDATA_MAP

struct unicode_data {
const int code;
const int combining_class;
const int exclusion;
const char* const canon;
const char* const compat;
const char* uppercase;
const char* lowercase;
const char* titlecase;
};

static const struct unicode_data unidata[] = {
EOS

## C source printed after the table rows: closes the unidata[]
## initializer and the include guard.
TAIL = <<'EOS'
};

#endif
EOS
|
36
|
+
|
37
|
+
## Convert a decomposition field into UTF-8 strings.
#
# hex - the raw field: space-separated code points written as 4 to 6
#       upper-case hexadecimal digits, optionally preceded by a tag in
#       angle brackets (e.g. "<compat>"). May be nil or "".
#
# Returns a two-element Array [canon, compat]:
#   [nil, nil]  when the field is nil or empty,
#   [str, str]  when the field has no tag,
#   [nil, str]  when the field starts with a tag.
#
# Raises RuntimeError ("unknown value: ...") when the field is neither
# form. Every code point token is checked, not only the first one, so a
# malformed later token can no longer be turned silently into U+0000 by
# String#hex.
def hex2str(hex)
  return [nil, nil] if hex.nil? || hex == ''

  tokens = hex.split(" ")
  tagged = !(tokens.first =~ /\A<.+>\z/).nil?
  codes = tagged ? tokens.drop(1) : tokens
  well_formed = codes.all? { |token| token =~ /\A[0-9A-F]{4,6}\z/ }
  raise "unknown value: " + hex if tokens.empty? || !well_formed

  text = codes.map { |token| token.hex }.pack("U*")
  # Hand out two distinct objects so mutating one result cannot affect
  # the other.
  tagged ? [nil, text] : [text, text.dup]
end
|
60
|
+
|
61
|
+
## Convert a space-separated list of hexadecimal code points to UTF-8.
#
# str - e.g. "0053 0073"; may be nil or "".
#
# Returns the encoded String, or nil when str is nil or empty.
# Builds the result with a single pack call instead of appending to a
# string literal, so it also works when string literals are frozen.
def hex_or_nil(str)
  return nil if str.nil? || str == ''

  str.split(" ").map { |token| token.hex }.pack("U*")
end
|
70
|
+
|
71
|
+
## Render a string as a C string literal, or NULL.
#
# str - the String to render; nil/false yields the bare token NULL.
#
# Printable ASCII bytes (32..126) are copied as is, except the double
# quote (34) and the backslash (92); those two and every other byte are
# written as a three-digit octal escape. The result is wrapped in
# double quotes.
#
# Returns a String such as "\"abc\"" or "NULL".
def printstr(str)
  return "NULL" unless str

  body = str.each_byte.map do |byte|
    printable = byte >= 32 && byte < 127 && byte != 34 && byte != 92
    printable ? byte.chr : format("\\%03o", byte)
  end.join
  '"' + body + '"'
end
|
83
|
+
|
84
|
+
## scan Composition Exclusions
# Builds `exclusion`: a Hash whose keys are the code points listed with
# the Full_Composition_Exclusion property in the file named by ARGV[1]
# (each value is true). A first column holding a single code point
# ("0958") or an inclusive range ("0340..0341") is accepted; any other
# line is ignored.
# File.open is used rather than Kernel#open so that an argument starting
# with "|" is never interpreted as a command to run.
exclusion = {}
File.open(ARGV[1]) do |f|
  f.each_line do |l|
    next if l =~ /^\#/ || l =~ /^$/
    next if l !~ /Full_Composition_Exclusion/
    code, = l.split(/\s/)
    case code
    when /^[0-9A-F]+$/
      exclusion[code.hex] = true
    when /^([0-9A-F]+)\.\.([0-9A-F]+)$/
      ($1.hex..$2.hex).each { |cp| exclusion[cp] = true }
    end
  end
end
|
104
|
+
|
105
|
+
## scan Special Casing
# Builds `casing`: code point => [lowercase, titlecase, uppercase], read
# from the file named by ARGV[2]. Each mapping is a UTF-8 String, or nil
# when the field merely repeats the code point itself. Lines carrying a
# fifth (condition) field are skipped.
# File.open is used rather than Kernel#open so that an argument starting
# with "|" is never interpreted as a command to run.
casing = {}
File.open(ARGV[2]) do |f|
  f.each_line do |l|
    l.chomp!
    next if l =~ /^\#/ || l =~ /^$/
    # Cut the trailing comment at the first '#'. A line without any
    # comment is used as is instead of crashing on a failed match.
    l = l.sub(/#.*\z/, '')
    next if l.strip.empty?
    code, lower, title, upper, cond = l.split(/;\s/)
    next if cond
    lower = nil if code == lower
    title = nil if code == title
    upper = nil if code == upper
    casing[code.hex] = [hex_or_nil(lower), hex_or_nil(title), hex_or_nil(upper)]
  end
end
|
123
|
+
|
124
|
+
## scan UnicodeData
# Builds `udata`: code point =>
#   [combining class, canon, compat, uppercase, lowercase, titlecase]
# from the semicolon-separated file named by ARGV[0]. Only fields 0
# (code), 3 (combining class), 5 (decomposition) and 12..14 (upper,
# lower and title case mappings) are used; the others are ignored.
# File.open is used rather than Kernel#open so that an argument starting
# with "|" is never interpreted as a command to run.
udata = {}
File.open(ARGV[0]) do |f|
  f.each_line do |l|
    l.chomp!
    next if l.empty? # an empty line has no code field to convert
    fields = l.split(";", 15)
    code = fields[0].hex
    ccclass = fields[3].to_i
    canon, compat = hex2str(fields[5])
    upcase = hex_or_nil(fields[12])
    lowcase = hex_or_nil(fields[13])
    titlecase = hex_or_nil(fields[14])
    udata[code] = [ccclass, canon, compat, upcase, lowcase, titlecase]
  end
end
|
141
|
+
|
142
|
+
## Emit the table: HEAD, one initializer row per entry of `udata` in
## ascending code point order, a terminating row with code -1, then TAIL.
print HEAD
udata.sort.each do |code, data|
  ccclass, canon, compat, upcase, lowcase, titlecase = data
  ## Exclusions
  # ex is 0 unless one of the rules below applies; the first match wins.
  ex = 0
  if exclusion[code] ## Script-specifics or Post Composition Version
    ex = 1
  elsif canon =~ /^.$/ ## Singletons
    ex = 2
  elsif !canon.nil?
    starter = canon.unpack("U*")[0]
    # A starter with no entry in udata is treated as combining class 0
    # instead of raising NoMethodError on nil.
    starter_class = (udata[starter] || [0])[0]
    if starter_class != 0 ## Non-starter decompositions
      ex = 3
    end
  end
  ## Special Casing
  # Mappings collected in `casing` override the ones stored in udata;
  # nil entries leave the udata value in place.
  special = casing[code]
  if special
    lowcase = special[0] if special[0]
    titlecase = special[1] if special[1]
    upcase = special[2] if special[2]
  end
  printf(" { 0x%04x, %d, %d, %s, %s, %s, %s, %s }, \n",
         code, ccclass, ex, printstr(canon),
         printstr(compat), printstr(upcase), printstr(lowcase),
         printstr(titlecase))
end
printf(" { -1, 0, 0, NULL, NULL, NULL, NULL, NULL }\n")
print TAIL
|
data/unicode.gemspec
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
## Gem specification for the unicode library.
# The file list is whatever `git ls-files` reports, minus paths that
# start with a dot.
Gem::Specification.new do |spec|
  spec.name = 'unicode'
  spec.version = '0.3.1'
  spec.date = '2010-02-26'
  spec.summary = 'Unicode normalization library.'
  spec.author = 'Yoshida Masato'
  spec.email = 'yoshidam@yoshidam.net'
  spec.homepage = 'http://www.yoshidam.net/Ruby.html#unicode'
  spec.require_paths = ['lib']
  tracked = `git ls-files`.split("\n")
  spec.files = tracked.reject { |path| path =~ /^\./ }
  spec.extra_rdoc_files = ['README']
  spec.extensions = ['ext/unicode/extconf.rb']
end
|
metadata
ADDED
@@ -0,0 +1,81 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: unicode
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 17
|
5
|
+
prerelease: false
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 3
|
9
|
+
- 1
|
10
|
+
version: 0.3.1
|
11
|
+
platform: x86-mswin32-60
|
12
|
+
authors:
|
13
|
+
- Yoshida Masato
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2010-02-26 00:00:00 -03:00
|
19
|
+
default_executable:
|
20
|
+
dependencies: []
|
21
|
+
|
22
|
+
description:
|
23
|
+
email: yoshidam@yoshidam.net
|
24
|
+
executables: []
|
25
|
+
|
26
|
+
extensions: []
|
27
|
+
|
28
|
+
extra_rdoc_files:
|
29
|
+
- README
|
30
|
+
files:
|
31
|
+
- README
|
32
|
+
- Rakefile
|
33
|
+
- ext/unicode/extconf.rb
|
34
|
+
- ext/unicode/unicode.c
|
35
|
+
- ext/unicode/unidata.map
|
36
|
+
- ext/unicode/ustring.c
|
37
|
+
- ext/unicode/ustring.h
|
38
|
+
- ext/unicode/wstring.c
|
39
|
+
- ext/unicode/wstring.h
|
40
|
+
- lib/unicode.rb
|
41
|
+
- test/test.rb
|
42
|
+
- tools/README
|
43
|
+
- tools/mkunidata.rb
|
44
|
+
- unicode.gemspec
|
45
|
+
- lib/unicode/unicode_native.so
|
46
|
+
has_rdoc: true
|
47
|
+
homepage: http://www.yoshidam.net/Ruby.html#unicode
|
48
|
+
licenses: []
|
49
|
+
|
50
|
+
post_install_message:
|
51
|
+
rdoc_options: []
|
52
|
+
|
53
|
+
require_paths:
|
54
|
+
- lib
|
55
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
56
|
+
none: false
|
57
|
+
requirements:
|
58
|
+
- - ">="
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
hash: 3
|
61
|
+
segments:
|
62
|
+
- 0
|
63
|
+
version: "0"
|
64
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ">="
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
hash: 3
|
70
|
+
segments:
|
71
|
+
- 0
|
72
|
+
version: "0"
|
73
|
+
requirements: []
|
74
|
+
|
75
|
+
rubyforge_project:
|
76
|
+
rubygems_version: 1.3.7
|
77
|
+
signing_key:
|
78
|
+
specification_version: 3
|
79
|
+
summary: Unicode normalization library.
|
80
|
+
test_files: []
|
81
|
+
|