unicode 0.4.4-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README +156 -0
- data/Rakefile +117 -0
- data/lib/unicode.rb +13 -0
- data/test/test.rb +69 -0
- data/tools/README +7 -0
- data/tools/mkunidata.rb +293 -0
- data/tools/normtest.rb +111 -0
- data/unicode.gemspec +30 -0
- metadata +52 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 314559dfe96e14c8d0f640f1aa8498fb76fd1b23
|
4
|
+
data.tar.gz: 972889e38f2b9641c5d926b33ee5bd49a3b0df19
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 11aefd96309397148db1cfa440e520775bbc75696eb040375e7ce994781294768f6ed594123f2f131ca90bc215670681472d8aff05e54b8743932f45607160f0
|
7
|
+
data.tar.gz: db4c1e76b612de731e5868fd737bf53e7bf9a55ce12430e47e08f3bc94c8320363d066b1235a4178cd0406aabebe70edccd600198a4582d2c1d73a96951647ec
|
data/README
ADDED
@@ -0,0 +1,156 @@
|
|
1
|
+
Unicode Library for Ruby
|
2
|
+
Version 0.4.4
|
3
|
+
|
4
|
+
Yoshida Masato
|
5
|
+
|
6
|
+
|
7
|
+
- Introduction
|
8
|
+
|
9
|
+
Unicode string manipulation library for Ruby.
|
10
|
+
This library is based on UAX #15 Unicode Normalization Forms(*1).
|
11
|
+
|
12
|
+
*1 <URL:http://www.unicode.org/unicode/reports/tr15/>
|
13
|
+
|
14
|
+
|
15
|
+
- Install
|
16
|
+
|
17
|
+
This works with ruby-1.8.7 or later. I recommend that you
|
18
|
+
use ruby-1.9.3 or later.
|
19
|
+
|
20
|
+
Build and install in the usual way.
|
21
|
+
For example, when Ruby supports dynamic linking on your OS,
|
22
|
+
|
23
|
+
ruby extconf.rb
|
24
|
+
make
|
25
|
+
make install
|
26
|
+
|
27
|
+
To install using gem, for example:
|
28
|
+
|
29
|
+
gem build unicode.gemspec
|
30
|
+
gem install unicode
|
31
|
+
|
32
|
+
|
33
|
+
- Usage
|
34
|
+
|
35
|
+
If you do not link this module with Ruby statically,
|
36
|
+
|
37
|
+
require "unicode"
|
38
|
+
|
39
|
+
before using.
|
40
|
+
|
41
|
+
|
42
|
+
- Module Functions
|
43
|
+
|
44
|
+
All parameters of functions must be UTF-8 strings.
|
45
|
+
|
46
|
+
Unicode::strcmp(str1, str2)
|
47
|
+
Unicode::strcmp_compat(str1, str2)
|
48
|
+
Compare Unicode strings with a normalization.
|
49
|
+
strcmp uses the Normalization Form D, strcmp_compat uses
|
50
|
+
Normalization Form KD.
|
51
|
+
|
52
|
+
Unicode::decompose(str)
|
53
|
+
Unicode::decompose_compat(str)
|
54
|
+
Decompose Unicode string. Then the trailing characters
|
55
|
+
are sorted in canonical order.
|
56
|
+
decompose uses the canonical decomposition,
|
57
|
+
decompose_compat uses the compatibility decomposition.
|
58
|
+
The decomposition is based on the character decomposition
|
59
|
+
mapping in UnicodeData.txt and the Hangul decomposition
|
60
|
+
algorithm.
|
61
|
+
|
62
|
+
Unicode::decompose_safe(str)
|
63
|
+
Decompose Unicode string with a non-standard mapping.
|
64
|
+
It does not decompose the characters in
|
65
|
+
CompositionExclusions.txt.
|
66
|
+
|
67
|
+
Unicode::compose(str)
|
68
|
+
Compose Unicode string. Before composing, the trailing
|
69
|
+
characters are sorted in canonical order.
|
70
|
+
The parameter must be decomposed.
|
71
|
+
The composition is based on the reverse of the
|
72
|
+
character decomposition mapping in UnicodeData.txt,
|
73
|
+
CompositionExclusions.txt and the Hangul composition
|
74
|
+
algorithm.
|
75
|
+
|
76
|
+
Unicode::normalize_D(str) (Unicode::nfd(str))
|
77
|
+
Unicode::normalize_KD(str) (Unicode::nfkd(str))
|
78
|
+
Normalize Unicode string in form D or form KD.
|
79
|
+
These are aliases of decompose/decompose_compat.
|
80
|
+
|
81
|
+
Unicode::normalize_D_safe(str) (Unicode::nfd_safe(str))
|
82
|
+
This is an alias of decompose_safe.
|
83
|
+
|
84
|
+
Unicode::normalize_C(str) (Unicode::nfc(str))
|
85
|
+
Unicode::normalize_KC(str) (Unicode::nfkc(str))
|
86
|
+
Normalize Unicode string in form C or form KC.
|
87
|
+
normalize_C = decompose + compose
|
88
|
+
normalize_KC = decompose_compat + compose
|
89
|
+
|
90
|
+
Unicode::normalize_C_safe(str) (Unicode::nfc_safe(str))
|
91
|
+
Normalize Unicode string with decompose_safe.
|
92
|
+
normalize_C_safe = decompose_safe + compose
|
93
|
+
|
94
|
+
Unicode::upcase(str)
|
95
|
+
Unicode::downcase(str)
|
96
|
+
Unicode::capitalize(str)
|
97
|
+
Case conversion functions.
|
98
|
+
The mappings that are used by these functions are not normative
|
99
|
+
in UnicodeData.txt.
|
100
|
+
|
101
|
+
Unicode::categories(str)
|
102
|
+
Unicode::abbr_categories(str)
|
103
|
+
Get an array of general category names of the string.
|
104
|
+
abbr_categories returns abbreviated names.
|
105
|
+
These can be called with a block.
|
106
|
+
|
107
|
+
Unicode.categories(str) do |category| p category end
|
108
|
+
|
109
|
+
Unicode::text_elements(str)
|
110
|
+
Get an array of text elements.
|
111
|
+
A text element is a unit that is displayed as a single character.
|
112
|
+
These can be called with a block.
|
113
|
+
|
114
|
+
Unicode::width(str[, cjk])
|
115
|
+
Estimate the display width on the fixed pitch text terminal.
|
116
|
+
It is based on Markus Kuhn's mk_wcwidth.
|
117
|
+
If the optional argument 'cjk' is true, East Asian
|
118
|
+
Ambiguous characters are treated as wide characters.
|
119
|
+
|
120
|
+
Unicode.width("\u03b1") #=> 1
|
121
|
+
Unicode.width("\u03b1", true) #=> 2
|
122
|
+
|
123
|
+
|
124
|
+
- Bugs
|
125
|
+
|
126
|
+
UAX #15 suggests that the look up for Normalization Form C
|
127
|
+
should not be implemented with a hash of string for better
|
128
|
+
performance.
|
129
|
+
|
130
|
+
|
131
|
+
- Copying
|
132
|
+
|
133
|
+
This extension module is copyrighted free software by
|
134
|
+
Yoshida Masato.
|
135
|
+
|
136
|
+
You can redistribute it and/or modify it under the same
|
137
|
+
terms as Ruby.
|
138
|
+
|
139
|
+
|
140
|
+
- Author
|
141
|
+
|
142
|
+
Yoshida Masato <yoshidam@yoshidam.net>
|
143
|
+
|
144
|
+
|
145
|
+
- History
|
146
|
+
|
147
|
+
Feb 7, 2013 version 0.4.4 update unidata.map for Unicode 6.2
|
148
|
+
Aug 8, 2012 version 0.4.3 add categories, text_elements and width
|
149
|
+
Feb 29, 2012 version 0.4.2 add decompose_safe
|
150
|
+
Feb 3, 2012 version 0.4.1 update unidata.map for Unicode 6.1
|
151
|
+
Oct 14, 2010 version 0.4.0 fix the composition algorithm, and support Unicode 6.0
|
152
|
+
Feb 26, 2010 version 0.3.0 fix a capitalize bug and support SpecialCasing
|
153
|
+
Dec 29, 2009 version 0.2.0 update for Ruby 1.9.1 and Unicode 5.2
|
154
|
+
Sep 10, 2005 version 0.1.2 update unidata.map for Unicode 4.1.0
|
155
|
+
Aug 26, 2004 version 0.1.1 update unidata.map for Unicode 4.0.1
|
156
|
+
Nov 23, 1999 version 0.1
|
data/Rakefile
ADDED
@@ -0,0 +1,117 @@
|
|
1
|
+
require "rake/clean"
|
2
|
+
require "rake/extensiontask"
|
3
|
+
require "rubygems/package_task"
|
4
|
+
|
5
|
+
# Paths added to rake/clean's CLEAN list.
CLEAN << "pkg" << "tmp" << "lib/unicode"

# Template for the upstream tarball location; %s is replaced by the version.
UPSTREAM_URL = 'http://www.yoshidam.net/unicode-%s.tar.gz'

# The gemspec file evaluates to a Gem::Specification; load it from next to
# this Rakefile.
gemspec_path = File.expand_path("../unicode.gemspec", __FILE__)
gem_spec = eval(File.read(gemspec_path))

gem_task = Gem::PackageTask.new(gem_spec) { |pkg| }

# Native extension build, including cross compilation for Windows.
Rake::ExtensionTask.new('unicode_native', gem_spec) do |ext|
  ext.cross_compile  = true
  ext.cross_platform = ['x86-mingw32', 'x86-mswin32-60']
  ext.ext_dir        = "ext/unicode"
  ext.lib_dir        = "lib/unicode"
end
|
19
|
+
|
20
|
+
namespace :gem do

  desc 'Build all gem files'
  task :all => ['clean', 'gem', 'gem:java', 'gem:windows']

  # Derive the java-platform spec from the main one: no extensions and no
  # files under ext/.
  jruby_spec = gem_spec.dup
  jruby_spec.platform = 'java'
  jruby_spec.extensions.clear
  jruby_spec.files.reject! { |path| path.start_with?('ext/') }

  pkg_dir = gem_task.package_dir
  directory pkg_dir

  gem_name   = File.basename(jruby_spec.cache_file)
  gem_target = File.join(pkg_dir, gem_name)

  desc "Build the gem file #{gem_name}"
  task :java => gem_target

  file gem_target => [pkg_dir, *jruby_spec.files] do
    lib_file = 'lib/unicode.rb'
    tmp_file = "#{lib_file}.tmp-#{$$}"

    begin
      # Park the real lib file, write a stub in its place for packaging,
      # and put the real file back afterwards (see ensure).
      mv lib_file, tmp_file

      File.write(lib_file, <<-EOT)
module Unicode

extend self

def upcase(str)
str.to_java.to_upper_case
end

def downcase(str)
str.to_java.to_lower_case
end

end
      EOT

      Gem::Package.build(jruby_spec)

      mv gem_name, pkg_dir
    ensure
      mv tmp_file, lib_file if File.exist?(tmp_file)
    end
  end

  desc "Build native gems for Windows"
  task :windows do
    ENV["RUBY_CC_VERSION"] = "1.8.7:1.9.3"
    ["rake cross compile", "rake cross native gem"].each { |cmd| sh cmd }
  end

end
|
77
|
+
|
78
|
+
desc "Update from upstream"
# Downloads the upstream tarball for the requested version and unpacks the
# files this gem tracks into their locations in this repository.
#
# The version comes from the task argument (rake update[0.4.4]) or from the
# UPSTREAM_VERSION environment variable; the task aborts when neither is set
# or when the download fails with an HTTP error.
task :update, [:version] do |_t, args|
  require 'zlib'
  require 'open-uri'
  require 'archive/tar/minitar'

  unless version = args.version || ENV['UPSTREAM_VERSION']
    abort "Please specify UPSTREAM_VERSION. See #{gem_spec.homepage}."
  end

  url = UPSTREAM_URL % version

  io = begin
    # Newer open-uri exposes URI.open and no longer makes Kernel#open accept
    # URLs; older versions only patch Kernel#open. Support both.
    URI.respond_to?(:open) ? URI.open(url) : open(url)
  rescue OpenURI::HTTPError
    abort "Upstream version not found: #{url}. See #{gem_spec.homepage}."
  end

  Archive::Tar::Minitar.open(Zlib::GzipReader.new(io)) { |tar|
    basedir = File.expand_path('..', __FILE__)

    # Extracts one entry below basedir (or basedir/dir when dir is given).
    # NOTE(review): entry names come from the downloaded archive and are not
    # checked for "../" components - confirm the upstream source is trusted.
    extract = lambda { |entry, name, dir|
      puts "Extracting `#{name}' to `#{dir || '.'}'..."
      tar.extract_entry(dir ? File.join(basedir, dir) : basedir, entry)
    }

    tar.each { |entry|
      # Drop the leading "unicode/" directory of the upstream archive.
      entry.name.sub!(/\Aunicode\//, '')

      case name = entry.full_name
      when /\Atools\/|\.gemspec\z/, 'README'
        extract[entry, name, nil]
      when /\.(?:[ch]|map)\z/, 'extconf.rb'
        extract[entry, name, 'ext/unicode']
      when /\Atest/
        extract[entry, name, 'test']
      else
        puts "Skipping `#{name}'..."
      end
    }
  }
end
|
data/lib/unicode.rb
ADDED
data/test/test.rb
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
#! /usr/local/bin/ruby -KU
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
|
4
|
+
require 'unicode'
|
5
|
+
|
6
|
+
## dump Unicode string
|
7
|
+
class String
  # Returns the string's code points as an array of "U+XXXX" labels
  # (at least four upper-case hex digits each).
  #
  # unpack("U*") yields only Integers, so every element is formatted.
  def udump
    unpack("U*").map { |codepoint| "U+%04X" % codepoint }
  end
end
|
21
|
+
|
22
|
+
|
23
|
+
print "Canonical decomposition vs compatibility decomposition\n"
|
24
|
+
p Unicode::decompose("⑽ o\xef\xac\x83ce").udump
|
25
|
+
p Unicode::decompose_compat("⑽ o\xef\xac\x83ce")
|
26
|
+
|
27
|
+
print "Canonical equivalent vs Compatibility equivalent\n"
|
28
|
+
p Unicode::strcmp("ガ", "ガ")
|
29
|
+
p Unicode::strcmp("ガ", "ガ")
|
30
|
+
p Unicode::strcmp_compat("ガ", "ガ")
|
31
|
+
|
32
|
+
print "Decomposition/composition\n"
|
33
|
+
p Unicode::normalize_D([0x63, 0x301, 0x327].pack("U*")).udump
|
34
|
+
p Unicode::normalize_D([0x63, 0x327, 0x301].pack("U*")).udump
|
35
|
+
p Unicode::normalize_D([0x107, 0x327].pack("U*")).udump
|
36
|
+
p Unicode::normalize_D([0xe7, 0x301].pack("U*")).udump
|
37
|
+
p Unicode::normalize_C([0x63, 0x301, 0x327].pack("U*")).udump
|
38
|
+
p Unicode::normalize_C([0x63, 0x327, 0x301].pack("U*")).udump
|
39
|
+
p Unicode::normalize_C([0x107, 0x327].pack("U*")).udump
|
40
|
+
p Unicode::normalize_C([0xe7, 0x301].pack("U*")).udump
|
41
|
+
|
42
|
+
print "Kana Normalization\n"
|
43
|
+
p Unicode::normalize_D("ガガ").udump
|
44
|
+
p Unicode::normalize_C("ガガ").udump
|
45
|
+
p Unicode::normalize_KD("ガガ").udump
|
46
|
+
p Unicode::normalize_KC("ガガ").udump
|
47
|
+
|
48
|
+
print "Hangul\n"
|
49
|
+
p "요시담".udump
|
50
|
+
p Unicode::normalize_D("요시담").udump
|
51
|
+
p Unicode::normalize_C("요시담").udump
|
52
|
+
|
53
|
+
print "Composition Exclusion\n"
|
54
|
+
print " ANGSTROM SIGN [U+212B]\n"
|
55
|
+
p Unicode::normalize_D([0x212b].pack("U")).udump
|
56
|
+
p Unicode::normalize_C([0x212b].pack("U")).udump
|
57
|
+
print " LATIN CAPITAL LETTER A WITH RING ABOVE [U+00C5]\n"
|
58
|
+
p Unicode::normalize_D([0x00c5].pack("U")).udump
|
59
|
+
p Unicode::normalize_C([0x00c5].pack("U")).udump
|
60
|
+
|
61
|
+
print "Case conversion\n"
|
62
|
+
p Unicode::normalize_C(Unicode::upcase([0x63, 0x301, 0x327, 0xff41].pack("U*"))).udump
|
63
|
+
p Unicode::normalize_C(Unicode::downcase([0x43, 0x301, 0x327, 0xff21].pack("U*"))).udump
|
64
|
+
p Unicode::capitalize([0x1f1, 0x41, 0x61, 0xff21].pack("U*")).udump
|
65
|
+
|
66
|
+
|
67
|
+
## Local variables:
|
68
|
+
## coding: utf-8
|
69
|
+
## End:
|
data/tools/README
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
The bundled unidata.map is created from UnicodeData.txt,
|
2
|
+
DerivedNormalizationProps.txt, SpecialCasing.txt and EastAsianWidth.txt
|
3
|
+
of Unicode 6.0.
|
4
|
+
|
5
|
+
To update unidata.map,
|
6
|
+
|
7
|
+
ruby mkunidata.rb UnicodeData.txt DerivedNormalizationProps.txt SpecialCasing.txt EastAsianWidth.txt > unidata.map
|
data/tools/mkunidata.rb
ADDED
@@ -0,0 +1,293 @@
|
|
1
|
+
#! /usr/local/bin/ruby -KU
|
2
|
+
|
3
|
+
#if $KCODE != 'UTF8'
|
4
|
+
# raise "$KCODE must be UTF8"
|
5
|
+
#end
|
6
|
+
|
7
|
+
HEAD=<<EOS
|
8
|
+
/*
|
9
|
+
* UnicodeData
|
10
|
+
* Copyright 1999, 2004, 2010, 2012 by yoshidam
|
11
|
+
*
|
12
|
+
*/
|
13
|
+
|
14
|
+
#ifndef _UNIDATA_MAP
|
15
|
+
#define _UNIDATA_MAP
|
16
|
+
|
17
|
+
EOS
|
18
|
+
|
19
|
+
HEAD1=<<EOS
|
20
|
+
|
21
|
+
enum GeneralCategory {
|
22
|
+
/* Letter */
|
23
|
+
c_Lu = 1, c_Ll, c_Lt, c_LC, c_Lm, c_Lo,
|
24
|
+
/* Mark */
|
25
|
+
c_Mn, c_Mc, c_Me,
|
26
|
+
/* Number */
|
27
|
+
c_Nd, c_Nl, c_No,
|
28
|
+
/* Punctuation */
|
29
|
+
c_Pc, c_Pd, c_Ps, c_Pe, c_Pi, c_Pf, c_Po,
|
30
|
+
/* Symbol */
|
31
|
+
c_Sm, c_Sc, c_Sk, c_So,
|
32
|
+
/* Separator */
|
33
|
+
c_Zs, c_Zl, c_Zp,
|
34
|
+
/* Other */
|
35
|
+
c_Cc, c_Cf, c_Cs, c_Co, c_Cn
|
36
|
+
};
|
37
|
+
|
38
|
+
const char* const gencat_abbr[] = {
|
39
|
+
"", /* 0 */
|
40
|
+
/* Letter */
|
41
|
+
"Lu", "Ll", "Lt", "LC", "Lm", "Lo",
|
42
|
+
/* Mark */
|
43
|
+
"Mn", "Mc", "Me",
|
44
|
+
/* Number */
|
45
|
+
"Nd", "Nl", "No",
|
46
|
+
/* Punctuation */
|
47
|
+
"Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po",
|
48
|
+
/* Symbol */
|
49
|
+
"Sm", "Sc", "Sk", "So",
|
50
|
+
/* Separator */
|
51
|
+
"Zs", "Zl", "Zp",
|
52
|
+
/* Other */
|
53
|
+
"Cc", "Cf", "Cs", "Co", "Cn"
|
54
|
+
};
|
55
|
+
|
56
|
+
const char* const gencat_long[] = {
|
57
|
+
"",
|
58
|
+
"Uppercase_Letter",
|
59
|
+
"Lowercase_Letter",
|
60
|
+
"Titlecase_Letter",
|
61
|
+
"Cased_Letter",
|
62
|
+
"Modifier_Letter",
|
63
|
+
"Other_Letter",
|
64
|
+
"Nonspacing_Mark",
|
65
|
+
"Spacing_Mark",
|
66
|
+
"Enclosing_Mark",
|
67
|
+
"Decimal_Number",
|
68
|
+
"Letter_Number",
|
69
|
+
"Other_Number",
|
70
|
+
"Connector_Punctuation",
|
71
|
+
"Dash_Punctuation",
|
72
|
+
"Open_Punctuation",
|
73
|
+
"Close_Punctuation",
|
74
|
+
"Initial_Punctuation",
|
75
|
+
"Final_Punctuation",
|
76
|
+
"Other_Punctuation",
|
77
|
+
"Math_Symbol",
|
78
|
+
"Currency_Symbol",
|
79
|
+
"Modifier_Symbol",
|
80
|
+
"Other_Symbol",
|
81
|
+
"Space_Separator",
|
82
|
+
"Line_Separator",
|
83
|
+
"Paragraph_Separator",
|
84
|
+
"Control",
|
85
|
+
"Format",
|
86
|
+
"Surrogate",
|
87
|
+
"Private_Use",
|
88
|
+
"Unassigned"
|
89
|
+
};
|
90
|
+
|
91
|
+
enum EastAsianWidth {
|
92
|
+
w_N = 1, w_A, w_H, w_W, w_F, w_Na
|
93
|
+
};
|
94
|
+
|
95
|
+
struct unicode_data {
|
96
|
+
const int code;
|
97
|
+
const char* const canon;
|
98
|
+
const char* const compat;
|
99
|
+
const char* const uppercase;
|
100
|
+
const char* const lowercase;
|
101
|
+
const char* const titlecase;
|
102
|
+
const unsigned char combining_class;
|
103
|
+
const unsigned char exclusion;
|
104
|
+
const unsigned char general_category;
|
105
|
+
const unsigned char east_asian_width;
|
106
|
+
};
|
107
|
+
|
108
|
+
static const struct unicode_data unidata[] = {
|
109
|
+
EOS
|
110
|
+
|
111
|
+
TAIL=<<EOS
|
112
|
+
};
|
113
|
+
|
114
|
+
#endif
|
115
|
+
EOS
|
116
|
+
|
117
|
+
# Converts a decomposition field (space-separated hex code points, optionally
# preceded by a "<tag>") into a [canon, compat] pair of UTF-8 strings.
#
# * nil or empty field      -> [nil, nil]
# * plain code points       -> the same string for both canon and compat
# * "<tag>" then code points -> [nil, compat]
#
# Raises RuntimeError when the first token is neither a 4-6 digit upper-case
# hex number nor a "<...>" tag.
def hex2str(hex)
  return [nil, nil] if hex.nil? || hex.empty?

  fields = hex.split(" ")
  case fields[0]
  when /\A[0-9A-F]{4,6}\z/
    canon = fields.map { |c| c.hex }.pack("U*")
    [canon, canon]
  when /\A<.+>\z/
    # Skip the tag itself; only the mapping that follows is kept.
    [nil, fields.drop(1).map { |c| c.hex }.pack("U*")]
  else
    raise "unknown value: " + hex
  end
end
|
140
|
+
|
141
|
+
# Converts space-separated hex code points into a UTF-8 string.
# Returns nil when str is nil or empty.
def hex_or_nil(str)
  return nil if str.nil? || str.empty?

  str.split(" ").map { |c| c.hex }.pack("U*")
end
|
150
|
+
|
151
|
+
# Renders str as a double-quoted string literal for the generated C table,
# or the bare word NULL when str is nil/false.
#
# Printable ASCII bytes are emitted as-is; every other byte, plus the double
# quote (34) and backslash (92), becomes a three-digit octal escape.
def printstr(str)
  return "NULL" unless str

  body = str.each_byte.map do |byte|
    if byte >= 32 && byte < 127 && byte != 34 && byte != 92
      byte.chr
    else
      format("\\%03o", byte)
    end
  end
  '"' + body.join + '"'
end
|
163
|
+
|
164
|
+
# The generated map is written to stdout, so the usage text goes to stderr
# and the script exits with a failure status instead of reporting success.
if ARGV.length != 4
  warn "Usage: #{$0} <UnicodeData.txt> <DerivedNormalizationProps.txt> <SpecialCasing.txt> <EastAsianWidth.txt>"
  exit 1
end
|
168
|
+
|
169
|
+
## scan Composition Exclusions
# Builds exclusion[code] = true for every code point (or range of code
# points) listed with the Full_Composition_Exclusion property in ARGV[1].
exclusion = {}
# File.open: the argument is a plain path, never a command to run.
File.open(ARGV[1]) do |f|
  f.each_line do |l|
    next if l =~ /^\#/ || l =~ /^$/
    next if l !~ /Full_Composition_Exclusion/
    code, = l.split(/\s/)
    case code
    when /^[0-9A-F]+$/
      exclusion[code.hex] = true
    when /^([0-9A-F]+)\.\.([0-9A-F]+)$/
      ($1.hex..$2.hex).each { |cp| exclusion[cp] = true }
    end
  end
end
|
189
|
+
|
190
|
+
## scan Special Casing
# Builds casing[code] = [lower, title, upper] from ARGV[2]. Entries that carry
# a condition field are skipped, and a mapping identical to the code point
# itself is stored as nil.
casing = {}
File.open(ARGV[2]) do |f|
  f.each_line do |l|
    l = l.chomp
    next if l =~ /^\#/ || l =~ /^$/
    # Drop the trailing "# comment"; lines without one are used as they are.
    l = $1 if l =~ /^(.*)#/
    code, lower, title, upper, cond = l.split(/;\s/)
    next if cond
    casing[code.hex] = [lower, title, upper].map do |mapping|
      hex_or_nil(mapping == code ? nil : mapping)
    end
  end
end
|
208
|
+
|
209
|
+
## scan UnicodeData
# Builds, from ARGV[0]:
#   udata[code]  = [ccclass, canon, compat, upcase, lowcase, titlecase, gencat]
#   range_data   = [[NAME_FIRST/NAME_LAST, code], ...] for "<..., First>" and
#                  "<..., Last>" entries.
udata = {}
range_data = []
File.open(ARGV[0]) do |f|
  f.each_line do |l|
    # Of the 15 ';'-separated fields only these are used: 0 code, 1 name,
    # 2 general category, 3 combining class, 5 decomposition,
    # 12 uppercase, 13 lowercase, 14 titlecase.
    code, charname, gencat, ccclass, decomp, upcase, lowcase, titlecase =
      l.chomp.split(";", 15).values_at(0, 1, 2, 3, 5, 12, 13, 14)
    code = code.hex
    canon, compat = hex2str(decomp)
    udata[code] = [ccclass.to_i, canon, compat, hex_or_nil(upcase),
                   hex_or_nil(lowcase), hex_or_nil(titlecase), gencat]
    if charname =~ /^<(.*, (First|Last))>$/
      # e.g. "<Foo Bar, First>" becomes "FOO_BAR_FIRST"
      range_data << [$1.upcase.gsub(/,? /, '_'), code]
    end
  end
end
|
231
|
+
|
232
|
+
## scan EastAsianWidth
# Builds ea_width[code] = width from ARGV[3]. Each data line is either
# "CODE;WIDTH" or "FIRST..LAST;WIDTH", followed by a "# comment".
ea_width = {}
File.open(ARGV[3]) do |f|
  f.each_line do |l|
    l = l.chomp
    next if l =~ /^\#/ || l.strip.empty?
    # Drop the trailing "# comment"; lines without one are used as they are.
    l = $1 if l =~ /^(.*)\s+#/
    # Tolerate optional whitespace around the ';' separator.
    range, width = l.split(/;/).map { |field| field.strip }
    first, last = range.split('..')
    (first.hex..(last || first).hex).each { |cp| ea_width[cp] = width }
  end
end
|
255
|
+
|
256
|
+
# Emit the generated header: the prologue, one #define per range endpoint,
# the type declarations, one table row per code point, the terminator row
# and the epilogue.
print HEAD
range_data.each do |charname, code|
  printf("#define %s\t(0x%04x)\n", charname, code)
end

print HEAD1
udata.sort.each do |code, data|
  ccclass, canon, compat, upcase, lowcase, titlecase, gencat = data
  ## Exclusions
  ex = 0
  if exclusion[code] ## Script-specifics or Post Composition Version
    ex = 1
  elsif canon =~ /^.$/ ## Singletons
    ex = 2
  elsif !canon.nil?
    starter = canon.unpack("U*")[0]
    # A starter without its own udata row is treated as combining class 0
    # instead of raising on nil.
    # NOTE(review): presumably only code points inside First/Last ranges are
    # missing here - confirm.
    starter_data = udata[starter]
    ex = 3 if starter_data && starter_data[0] != 0 ## Non-starter decompositions
  end
  ## Special Casing
  special = casing[code]
  if special
    lowcase = special[0] if special[0]
    titlecase = special[1] if special[1]
    upcase = special[2] if special[2]
  end
  width = ea_width[code] || 'N'

  printf(" { 0x%04x, %s, %s, %s, %s, %s, %d, %d, c_%s, w_%s }, \n",
         code, printstr(canon),
         printstr(compat), printstr(upcase), printstr(lowcase),
         printstr(titlecase), ccclass, ex, gencat, width)
end
printf(" { -1, NULL, NULL, NULL, NULL, NULL, 0, 0, 0, 0 }\n")
print TAIL
|
data/tools/normtest.rb
ADDED
@@ -0,0 +1,111 @@
|
|
1
|
+
#! /usr/local/bin/ruby -KU
|
2
|
+
|
3
|
+
## Conformance test with NormalizationTest.txt
|
4
|
+
## Copyright 2010 yoshidam
|
5
|
+
|
6
|
+
require 'unicode'
|
7
|
+
|
8
|
+
TESTFILE = "NormalizationTest.txt"
|
9
|
+
|
10
|
+
# Converts space-separated hex code points (e.g. "0041 0301") into a UTF-8
# string. An empty or blank input gives an empty string.
def from_hex(str)
  str.split(" ").map { |c| c.hex }.pack("U*")
end
|
18
|
+
|
19
|
+
# Formats each code point of str as at least four upper-case hex digits
# followed by a space (so the result ends with a trailing space when str is
# not empty).
def to_hex(str)
  str.unpack('U*').map { |c| sprintf("%04X ", c) }.join
end
|
27
|
+
|
28
|
+
# Runs the conformance checks over every data line of TESTFILE.
# Each data line holds five ';'-separated columns c1..c5 of hex code points.
# Failures are reported on stdout as "<form> NG: ..." followed by the values
# involved; passing lines print nothing.
File.open(TESTFILE) do |f|
  f.each_line do |l|
    next if l =~ /^#/
    l = l.chomp
    next if l.empty?
    # "@..." lines are section markers: echo them and move on.
    if l =~ /^@/
      puts l
      next
    end
    # Anything after the fifth column is ignored.
    c1, c2, c3, c4, c5 = l.split(';').first(5).map { |field| from_hex(field) }

    ## NFC TEST
    ## c2 == NFC(c1) == NFC(c2) == NFC(c3); c4 == NFC(c4) == NFC(c5)
    unless c2 == Unicode.nfc(c1) && c2 == Unicode.nfc(c2) &&
           c2 == Unicode.nfc(c3) &&
           c4 == Unicode.nfc(c4) && c4 == Unicode.nfc(c5)
      puts "NFC NG: " + to_hex(c1)
      printf(" c2=%s NFC(c1)=%s NFC(c2)=%s NFC(c3)=%s\n",
             to_hex(c2),
             to_hex(Unicode.nfc(c1)),
             to_hex(Unicode.nfc(c2)),
             to_hex(Unicode.nfc(c3)))
      printf(" c4=%s NFC(c4)=%s NFC(c5)=%s\n",
             to_hex(c4),
             to_hex(Unicode.nfc(c4)),
             to_hex(Unicode.nfc(c5)))
    end

    ## NFD TEST
    ## c3 == NFD(c1) == NFD(c2) == NFD(c3); c5 == NFD(c4) == NFD(c5)
    unless c3 == Unicode.nfd(c1) && c3 == Unicode.nfd(c2) &&
           c3 == Unicode.nfd(c3) &&
           c5 == Unicode.nfd(c4) && c5 == Unicode.nfd(c5)
      puts "NFD NG: " + to_hex(c1)
      printf(" c3=%s NFD(c1)=%s NFD(c2)=%s NFD(c3)=%s\n",
             to_hex(c3),
             to_hex(Unicode.nfd(c1)),
             to_hex(Unicode.nfd(c2)),
             to_hex(Unicode.nfd(c3)))
      printf(" c5=%s NFD(c4)=%s NFD(c5)=%s\n",
             to_hex(c5),
             to_hex(Unicode.nfd(c4)),
             to_hex(Unicode.nfd(c5)))
    end

    ## NFKC TEST
    ## c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
    unless c4 == Unicode.nfkc(c1) && c4 == Unicode.nfkc(c2) &&
           c4 == Unicode.nfkc(c3) &&
           c4 == Unicode.nfkc(c4) && c4 == Unicode.nfkc(c5)
      puts "NFKC NG: " + to_hex(c1)
      printf(" c4=%s NFKC(c1)=%s NFKC(c2)=%s NFKC(c3)=%s NFKC(c4)=%s NFKC(c5)=%s\n",
             to_hex(c4),
             to_hex(Unicode.nfkc(c1)),
             to_hex(Unicode.nfkc(c2)),
             to_hex(Unicode.nfkc(c3)),
             to_hex(Unicode.nfkc(c4)),
             to_hex(Unicode.nfkc(c5)))
    end

    ## NFKD TEST
    ## c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
    unless c5 == Unicode.nfkd(c1) && c5 == Unicode.nfkd(c2) &&
           c5 == Unicode.nfkd(c3) &&
           c5 == Unicode.nfkd(c4) && c5 == Unicode.nfkd(c5)
      puts "NFKD NG: " + to_hex(c1)
      printf(" c5=%s NFKD(c1)=%s NFKD(c2)=%s NFKD(c3)=%s NFKD(c4)=%s NFKD(c5)=%s\n",
             to_hex(c5),
             to_hex(Unicode.nfkd(c1)),
             to_hex(Unicode.nfkd(c2)),
             to_hex(Unicode.nfkd(c3)),
             to_hex(Unicode.nfkd(c4)),
             to_hex(Unicode.nfkd(c5)))
    end
  end
end
|
data/unicode.gemspec
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
# Gem specification for the unicode library; evaluating this file yields the
# Gem::Specification object.
Gem::Specification.new do |s|
  s.name        = 'unicode'
  s.version     = '0.4.4'
  s.date        = '2013-02-07'
  s.summary     = 'Unicode normalization library.'
  s.description = 'Unicode normalization library.'

  s.authors  = ['Yoshida Masato']
  s.email    = 'yoshidam@yoshidam.net'
  s.homepage = 'http://www.yoshidam.net/Ruby.html#unicode'
  s.licenses = ['Ruby']

  if s.respond_to?(:required_rubygems_version=)
    s.required_rubygems_version = Gem::Requirement.new(">= 0")
  end
  s.rubygems_version = '1.8.6'

  s.require_paths    = ['lib']
  s.extensions       = ['ext/unicode/extconf.rb']
  s.extra_rdoc_files = ['README']
  s.files = %w[
    README
    Rakefile
    unicode.gemspec
    lib/unicode.rb
    test/test.rb
    tools/README
    tools/mkunidata.rb
    tools/normtest.rb
    ext/unicode/extconf.rb
    ext/unicode/unicode.c
    ext/unicode/unidata.map
    ext/unicode/ustring.c
    ext/unicode/ustring.h
    ext/unicode/wstring.c
    ext/unicode/wstring.h
  ]

  s.specification_version = 3 if s.respond_to?(:specification_version)
end
|
30
|
+
|
metadata
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: unicode
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.4.4
|
5
|
+
platform: java
|
6
|
+
authors:
|
7
|
+
- Yoshida Masato
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2013-02-07 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Unicode normalization library.
|
14
|
+
email: yoshidam@yoshidam.net
|
15
|
+
executables: []
|
16
|
+
extensions: []
|
17
|
+
extra_rdoc_files:
|
18
|
+
- README
|
19
|
+
files:
|
20
|
+
- README
|
21
|
+
- Rakefile
|
22
|
+
- lib/unicode.rb
|
23
|
+
- test/test.rb
|
24
|
+
- tools/README
|
25
|
+
- tools/mkunidata.rb
|
26
|
+
- tools/normtest.rb
|
27
|
+
- unicode.gemspec
|
28
|
+
homepage: http://www.yoshidam.net/Ruby.html#unicode
|
29
|
+
licenses:
|
30
|
+
- Ruby
|
31
|
+
metadata: {}
|
32
|
+
post_install_message:
|
33
|
+
rdoc_options: []
|
34
|
+
require_paths:
|
35
|
+
- lib
|
36
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
42
|
+
requirements:
|
43
|
+
- - ">="
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
requirements: []
|
47
|
+
rubyforge_project:
|
48
|
+
rubygems_version: 2.4.4
|
49
|
+
signing_key:
|
50
|
+
specification_version: 3
|
51
|
+
summary: Unicode normalization library.
|
52
|
+
test_files: []
|