unicode 0.4.4-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README +156 -0
- data/Rakefile +117 -0
- data/lib/unicode.rb +13 -0
- data/test/test.rb +69 -0
- data/tools/README +7 -0
- data/tools/mkunidata.rb +293 -0
- data/tools/normtest.rb +111 -0
- data/unicode.gemspec +30 -0
- metadata +52 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 314559dfe96e14c8d0f640f1aa8498fb76fd1b23
+  data.tar.gz: 972889e38f2b9641c5d926b33ee5bd49a3b0df19
+SHA512:
+  metadata.gz: 11aefd96309397148db1cfa440e520775bbc75696eb040375e7ce994781294768f6ed594123f2f131ca90bc215670681472d8aff05e54b8743932f45607160f0
+  data.tar.gz: db4c1e76b612de731e5868fd737bf53e7bf9a55ce12430e47e08f3bc94c8320363d066b1235a4178cd0406aabebe70edccd600198a4582d2c1d73a96951647ec
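These digests are the standard RubyGems checksums for the two archives packed inside the .gem file. A minimal verification sketch in Ruby (assuming a file named unicode-0.4.4-java.gem in the current directory; the file name and printed format are illustrative):

  require 'rubygems/package'
  require 'digest'

  # A .gem is a tar archive; metadata.gz and data.tar.gz are two of its members.
  File.open('unicode-0.4.4-java.gem', 'rb') do |gem|
    Gem::Package::TarReader.new(gem).each do |entry|
      next unless %w[metadata.gz data.tar.gz].include?(entry.full_name)
      puts "#{entry.full_name}: #{Digest::SHA512.hexdigest(entry.read)}"
    end
  end

The hexdigests printed this way should match the SHA512 entries above.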
data/README
ADDED
@@ -0,0 +1,156 @@
+Unicode Library for Ruby
+Version 0.4.4
+
+Yoshida Masato
+
+
+- Introduction
+
+Unicode string manipulation library for Ruby.
+This library is based on UAX #15 Unicode Normalization Forms(*1).
+
+*1 <URL:http://www.unicode.org/unicode/reports/tr15/>
+
+
+- Install
+
+This library works with ruby-1.8.7 or later. I recommend
+using ruby-1.9.3 or later.
+
+Build and install in the usual way.
+For example, when Ruby supports dynamic linking on your OS,
+
+  ruby extconf.rb
+  make
+  make install
+
+To install using gem, for example:
+
+  gem build unicode.gemspec
+  gem install unicode
+
+
+- Usage
+
+If you do not link this module with Ruby statically, run
+
+  require "unicode"
+
+before using it.
+
+
+- Module Functions
+
+All parameters of these functions must be UTF-8 strings.
+
+Unicode::strcmp(str1, str2)
+Unicode::strcmp_compat(str1, str2)
+  Compare Unicode strings after normalization.
+  strcmp uses Normalization Form D, strcmp_compat uses
+  Normalization Form KD.
+
+Unicode::decompose(str)
+Unicode::decompose_compat(str)
+  Decompose a Unicode string; the trailing characters
+  are then sorted in canonical order.
+  decompose uses the canonical decomposition,
+  decompose_compat uses the compatibility decomposition.
+  The decomposition is based on the character decomposition
+  mapping in UnicodeData.txt and the Hangul decomposition
+  algorithm.
+
+Unicode::decompose_safe(str)
+  Decompose a Unicode string with a non-standard mapping.
+  It does not decompose the characters listed in
+  CompositionExclusions.txt.
+
+Unicode::compose(str)
+  Compose a Unicode string. Before composing, the trailing
+  characters are sorted in canonical order.
+  The parameter must already be decomposed.
+  The composition is based on the reverse of the
+  character decomposition mapping in UnicodeData.txt,
+  CompositionExclusions.txt and the Hangul composition
+  algorithm.
+
+Unicode::normalize_D(str) (Unicode::nfd(str))
+Unicode::normalize_KD(str) (Unicode::nfkd(str))
+  Normalize a Unicode string to form D or form KD.
+  These are aliases of decompose/decompose_compat.
+
+Unicode::normalize_D_safe(str) (Unicode::nfd_safe(str))
+  This is an alias of decompose_safe.
+
+Unicode::normalize_C(str) (Unicode::nfc(str))
+Unicode::normalize_KC(str) (Unicode::nfkc(str))
+  Normalize a Unicode string to form C or form KC.
+  normalize_C = decompose + compose
+  normalize_KC = decompose_compat + compose
+
+Unicode::normalize_C_safe(str) (Unicode::nfc_safe(str))
+  Normalize a Unicode string with decompose_safe.
+  normalize_C_safe = decompose_safe + compose
+
+Unicode::upcase(str)
+Unicode::downcase(str)
+Unicode::capitalize(str)
+  Case conversion functions.
+  The mappings used by these functions are not normative
+  in UnicodeData.txt.
+
+Unicode::categories(str)
+Unicode::abbr_categories(str)
+  Get an array of general category names of the string.
+  abbr_categories returns abbreviated names.
+  These can be called with a block:
+
+    Unicode.categories(str) do |category| p category end
+
+Unicode::text_elements(str)
+  Get an array of text elements.
+  A text element is a unit that is displayed as a single character.
+  This can be called with a block.
+
+Unicode::width(str[, cjk])
+  Estimate the display width on a fixed-pitch text terminal.
+  It is based on Markus Kuhn's mk_wcwidth.
+  If the optional argument 'cjk' is true, East Asian
+  Ambiguous characters are treated as wide characters.
+
+    Unicode.width("\u03b1")       #=> 1
+    Unicode.width("\u03b1", true) #=> 2
+
+
+- Bugs
+
+UAX #15 suggests that, for better performance, the lookup
+for Normalization Form C should not be implemented with a
+hash of strings.
+
+
+- Copying
+
+This extension module is copyrighted free software by
+Yoshida Masato.
+
+You can redistribute it and/or modify it under the same
+terms as Ruby.
+
+
+- Author
+
+Yoshida Masato <yoshidam@yoshidam.net>
+
+
+- History
+
+Feb  7, 2013  version 0.4.4  update unidata.map for Unicode 6.2
+Aug  8, 2012  version 0.4.3  add categories, text_elements and width
+Feb 29, 2012  version 0.4.2  add decompose_safe
+Feb  3, 2012  version 0.4.1  update unidata.map for Unicode 6.1
+Oct 14, 2010  version 0.4.0  fix the composition algorithm, and support Unicode 6.0
+Feb 26, 2010  version 0.3.0  fix a capitalize bug and support SpecialCasing
+Dec 29, 2009  version 0.2.0  update for Ruby 1.9.1 and Unicode 5.2
+Sep 10, 2005  version 0.1.2  update unidata.map for Unicode 4.1.0
+Aug 26, 2004  version 0.1.1  update unidata.map for Unicode 4.0.1
+Nov 23, 1999  version 0.1
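A short usage sketch based on the module functions documented above (assuming the extension is built and on the load path; the width results are the README's own examples, the other results are illustrative):

  require "unicode"

  # NFD/NFC: "e" + combining acute vs. the precomposed U+00E9
  Unicode.nfc("e\u0301")                 #=> "\u00E9"
  Unicode.nfd("\u00E9").unpack("U*")     #=> [0x65, 0x301]

  # strcmp compares after NFD, strcmp_compat after NFKD
  p Unicode.strcmp("\u00E9", "e\u0301")

  # General categories and terminal display width
  p Unicode.abbr_categories("A1")        # e.g. ["Lu", "Nd"]
  Unicode.width("\u03b1")                #=> 1
  Unicode.width("\u03b1", true)          #=> 2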
data/Rakefile
ADDED
@@ -0,0 +1,117 @@
+require "rake/clean"
+require "rake/extensiontask"
+require "rubygems/package_task"
+
+CLEAN << "pkg" << "tmp" << "lib/unicode"
+
+UPSTREAM_URL = 'http://www.yoshidam.net/unicode-%s.tar.gz'
+
+gem_spec = eval(File.read(File.expand_path("../unicode.gemspec", __FILE__)))
+
+gem_task = Gem::PackageTask.new(gem_spec) {|pkg|}
+
+Rake::ExtensionTask.new('unicode_native', gem_spec) do |ext|
+  ext.cross_compile = true
+  ext.cross_platform = ['x86-mingw32', 'x86-mswin32-60']
+  ext.ext_dir = "ext/unicode"
+  ext.lib_dir = "lib/unicode"
+end
+
+namespace :gem do
+
+  desc 'Build all gem files'
+  task :all => %w[clean gem gem:java gem:windows]
+
+  java_gem_spec = gem_spec.dup
+  java_gem_spec.platform = 'java'
+  java_gem_spec.extensions.clear
+  java_gem_spec.files.delete_if { |f| f.start_with?('ext/') }
+
+  directory java_gem_dir = gem_task.package_dir
+
+  java_gem_file = File.basename(java_gem_spec.cache_file)
+  java_gem_path = File.join(java_gem_dir, java_gem_file)
+
+  desc "Build the gem file #{java_gem_file}"
+  task :java => java_gem_path
+
+  file java_gem_path => [java_gem_dir] + java_gem_spec.files do
+    lib_file = 'lib/unicode.rb'
+    tmp_file = "#{lib_file}.tmp-#{$$}"
+
+    begin
+      mv lib_file, tmp_file
+
+      File.write(lib_file, <<-EOT)
+        module Unicode
+
+          extend self
+
+          def upcase(str)
+            str.to_java.to_upper_case
+          end
+
+          def downcase(str)
+            str.to_java.to_lower_case
+          end
+
+        end
+      EOT
+
+      Gem::Package.build(java_gem_spec)
+
+      mv java_gem_file, java_gem_dir
+    ensure
+      mv tmp_file, lib_file if File.exist?(tmp_file)
+    end
+  end
+
+  desc "Build native gems for Windows"
+  task :windows do
+    ENV["RUBY_CC_VERSION"] = "1.8.7:1.9.3"
+    sh "rake cross compile"
+    sh "rake cross native gem"
+  end
+
+end
+
+desc "Update from upstream"
+task :update, [:version] do |t, args|
+  require 'zlib'
+  require 'open-uri'
+  require 'archive/tar/minitar'
+
+  unless version = args.version || ENV['UPSTREAM_VERSION']
+    abort "Please specify UPSTREAM_VERSION. See #{gem_spec.homepage}."
+  end
+
+  io = begin
+    open(url = UPSTREAM_URL % version)
+  rescue OpenURI::HTTPError
+    abort "Upstream version not found: #{url}. See #{gem_spec.homepage}."
+  end
+
+  Archive::Tar::Minitar.open(Zlib::GzipReader.new(io)) { |tar|
+    basedir = File.expand_path('..', __FILE__)
+
+    extract = lambda { |entry, name, dir|
+      puts "Extracting `#{name}' to `#{dir || '.'}'..."
+      tar.extract_entry(dir ? File.join(basedir, dir) : basedir, entry)
+    }
+
+    tar.each { |entry|
+      entry.name.sub!(/\Aunicode\//, '')
+
+      case name = entry.full_name
+      when /\Atools\/|\.gemspec\z/, 'README'
+        extract[entry, name, nil]
+      when /\.(?:[ch]|map)\z/, 'extconf.rb'
+        extract[entry, name, 'ext/unicode']
+      when /\Atest/
+        extract[entry, name, 'test']
+      else
+        puts "Skipping `#{name}'..."
+      end
+    }
+  }
+end
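Assuming rake-compiler and archive-tar-minitar are installed, the tasks above would typically be driven along these lines (illustrative):

  rake update[0.4.4]    # fetch the upstream tarball and refresh ext/, test/ and tools/
  rake gem:all          # build the plain, java and windows gem files into pkg/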
data/lib/unicode.rb
ADDED
data/test/test.rb
ADDED
@@ -0,0 +1,69 @@
+#! /usr/local/bin/ruby -KU
+# -*- coding: utf-8 -*-
+
+require 'unicode'
+
+## dump Unicode string
+class String
+  def udump
+    ustr = self.unpack("U*")
+    ret = []
+    ustr.each do |e|
+      if e.is_a?(Integer)
+        ret << "U+%04X" % e
+      else
+        ret << e
+      end
+    end
+    ret
+  end
+end
+
+
+print "Canonical decomposition vs compatibility decomposition\n"
+p Unicode::decompose("⑽ o\xef\xac\x83ce").udump
+p Unicode::decompose_compat("⑽ o\xef\xac\x83ce")
+
+print "Canonical equivalent vs Compatibility equivalent\n"
+p Unicode::strcmp("ガ", "ガ")
+p Unicode::strcmp("ガ", "ガ")
+p Unicode::strcmp_compat("ガ", "ガ")
+
+print "Decomposition/composition\n"
+p Unicode::normalize_D([0x63, 0x301, 0x327].pack("U*")).udump
+p Unicode::normalize_D([0x63, 0x327, 0x301].pack("U*")).udump
+p Unicode::normalize_D([0x107, 0x327].pack("U*")).udump
+p Unicode::normalize_D([0xe7, 0x301].pack("U*")).udump
+p Unicode::normalize_C([0x63, 0x301, 0x327].pack("U*")).udump
+p Unicode::normalize_C([0x63, 0x327, 0x301].pack("U*")).udump
+p Unicode::normalize_C([0x107, 0x327].pack("U*")).udump
+p Unicode::normalize_C([0xe7, 0x301].pack("U*")).udump
+
+print "Kana Normalization\n"
+p Unicode::normalize_D("ガガ").udump
+p Unicode::normalize_C("ガガ").udump
+p Unicode::normalize_KD("ガガ").udump
+p Unicode::normalize_KC("ガガ").udump
+
+print "Hangul\n"
+p "요시담".udump
+p Unicode::normalize_D("요시담").udump
+p Unicode::normalize_C("요시담").udump
+
+print "Composition Exclusion\n"
+print " ANGSTROM SIGN [U+212B]\n"
+p Unicode::normalize_D([0x212b].pack("U")).udump
+p Unicode::normalize_C([0x212b].pack("U")).udump
+print " LATIN CAPITAL LETTER A WITH RING ABOVE [U+00C5]\n"
+p Unicode::normalize_D([0x00c5].pack("U")).udump
+p Unicode::normalize_C([0x00c5].pack("U")).udump
+
+print "Case conversion\n"
+p Unicode::normalize_C(Unicode::upcase([0x63, 0x301, 0x327, 0xff41].pack("U*"))).udump
+p Unicode::normalize_C(Unicode::downcase([0x43, 0x301, 0x327, 0xff21].pack("U*"))).udump
+p Unicode::capitalize([0x1f1, 0x41, 0x61, 0xff21].pack("U*")).udump
+
+
+## Local variables:
+## coding: utf-8
+## End:
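For reference, the udump helper defined at the top of this test simply renders every codepoint as a U+XXXX string, so the shape of its output is (illustrative):

  "A\u0301".unpack("U*")   #=> [0x41, 0x301]
  "A\u0301".udump          #=> ["U+0041", "U+0301"]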
data/tools/README
ADDED
@@ -0,0 +1,7 @@
+The bundled unidata.map is created from UnicodeData.txt,
+DerivedNormalizationProps.txt, SpecialCasing.txt and EastAsianWidth.txt
+of Unicode 6.0.
+
+To update unidata.map,
+
+  ruby mkunidata.rb UnicodeData.txt DerivedNormalizationProps.txt SpecialCasing.txt EastAsianWidth.txt > unidata.map
data/tools/mkunidata.rb
ADDED
@@ -0,0 +1,293 @@
+#! /usr/local/bin/ruby -KU
+
+#if $KCODE != 'UTF8'
+#  raise "$KCODE must be UTF8"
+#end
+
+HEAD=<<EOS
+/*
+ * UnicodeData
+ * Copyright 1999, 2004, 2010, 2012 by yoshidam
+ *
+ */
+
+#ifndef _UNIDATA_MAP
+#define _UNIDATA_MAP
+
+EOS
+
+HEAD1=<<EOS
+
+enum GeneralCategory {
+  /* Letter */
+  c_Lu = 1, c_Ll, c_Lt, c_LC, c_Lm, c_Lo,
+  /* Mark */
+  c_Mn, c_Mc, c_Me,
+  /* Number */
+  c_Nd, c_Nl, c_No,
+  /* Punctuation */
+  c_Pc, c_Pd, c_Ps, c_Pe, c_Pi, c_Pf, c_Po,
+  /* Symbol */
+  c_Sm, c_Sc, c_Sk, c_So,
+  /* Separator */
+  c_Zs, c_Zl, c_Zp,
+  /* Other */
+  c_Cc, c_Cf, c_Cs, c_Co, c_Cn
+};
+
+const char* const gencat_abbr[] = {
+  "", /* 0 */
+  /* Letter */
+  "Lu", "Ll", "Lt", "LC", "Lm", "Lo",
+  /* Mark */
+  "Mn", "Mc", "Me",
+  /* Number */
+  "Nd", "Nl", "No",
+  /* Punctuation */
+  "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po",
+  /* Symbol */
+  "Sm", "Sc", "Sk", "So",
+  /* Separator */
+  "Zs", "Zl", "Zp",
+  /* Other */
+  "Cc", "Cf", "Cs", "Co", "Cn"
+};
+
+const char* const gencat_long[] = {
+  "",
+  "Uppercase_Letter",
+  "Lowercase_Letter",
+  "Titlecase_Letter",
+  "Cased_Letter",
+  "Modifier_Letter",
+  "Other_Letter",
+  "Nonspacing_Mark",
+  "Spacing_Mark",
+  "Enclosing_Mark",
+  "Decimal_Number",
+  "Letter_Number",
+  "Other_Number",
+  "Connector_Punctuation",
+  "Dash_Punctuation",
+  "Open_Punctuation",
+  "Close_Punctuation",
+  "Initial_Punctuation",
+  "Final_Punctuation",
+  "Other_Punctuation",
+  "Math_Symbol",
+  "Currency_Symbol",
+  "Modifier_Symbol",
+  "Other_Symbol",
+  "Space_Separator",
+  "Line_Separator",
+  "Paragraph_Separator",
+  "Control",
+  "Format",
+  "Surrogate",
+  "Private_Use",
+  "Unassigned"
+};
+
+enum EastAsianWidth {
+  w_N = 1, w_A, w_H, w_W, w_F, w_Na
+};
+
+struct unicode_data {
+  const int code;
+  const char* const canon;
+  const char* const compat;
+  const char* const uppercase;
+  const char* const lowercase;
+  const char* const titlecase;
+  const unsigned char combining_class;
+  const unsigned char exclusion;
+  const unsigned char general_category;
+  const unsigned char east_asian_width;
+};
+
+static const struct unicode_data unidata[] = {
+EOS
+
+TAIL=<<EOS
+};
+
+#endif
+EOS
+
+def hex2str(hex)
+  if hex.nil? || hex == ''
+    return [nil, nil]
+  end
+  canon = ""
+  compat = ""
+  chars = hex.split(" ")
+  if chars[0] =~ /^[0-9A-F]{4,6}$/
+    chars.each do |c|
+      canon << [c.hex].pack("U")
+    end
+    compat = canon
+  elsif chars[0] =~ /^<.+>$/
+    chars.shift
+    chars.each do |c|
+      compat << [c.hex].pack("U")
+    end
+    canon = nil
+  else
+    raise "unknown value: " + hex
+  end
+  [canon, compat]
+end
+
+def hex_or_nil(str)
+  return nil if str.nil? || str == ''
+  ret = ""
+  chars = str.split(" ")
+  chars.each do |c|
+    ret << [c.hex].pack("U")
+  end
+  return ret
+end
+
+def printstr(str)
+  return "NULL" if !str
+  ret = ""
+  str.each_byte do |c|
+    if c >= 32 && c < 127 && c != 34 && c != 92
+      ret << c
+    else
+      ret << format("\\%03o", c)
+    end
+  end
+  return '"' + ret + '"'
+end
+
+if ARGV.length != 4
+  puts "Usage: #{$0} <UnicodeData.txt> <DerivedNormalizationProps.txt> <SpecialCasing.txt> <EastAsianWidth.txt>"
+  exit 0
+end
+
+## scan Composition Exclusions
+exclusion = {}
+open(ARGV[1]) do |f|
+  while l = f.gets
+    next if l =~ /^\#/ || l =~ /^$/
+    next if l !~ /Full_Composition_Exclusion/
+    code, = l.split(/\s/)
+    if code =~ /^[0-9A-F]+$/
+      code = code.hex
+      exclusion[code] = true
+    elsif code =~ /^([0-9A-F]+)\.\.([0-9A-F]+)$/
+      # p [$1, $2]
+      scode = $1.hex
+      ecode = $2.hex
+      for code in scode..ecode
+        exclusion[code] = true
+      end
+    end
+  end
+end
+
+## scan Special Casing
+casing = {}
+open(ARGV[2]) do |f|
+  while l = f.gets
+    l.chomp!
+    next if l =~ /^\#/ || l =~ /^$/
+    l =~ /^(.*)#\s*(.*)$/
+    l = $1
+    comment = $2
+    code, lower, title, upper, cond = l.split(/;\s/)
+    next if cond
+    lower = nil if code == lower
+    title = nil if code == title
+    upper = nil if code == upper
+    code = code.hex
+    casing[code] = [hex_or_nil(lower), hex_or_nil(title), hex_or_nil(upper)]
+  end
+end
+
+## scan UnicodeData
+udata = {}
+range_data = []
+open(ARGV[0]) do |f|
+  while l = f.gets
+    l.chomp!
+    code, charname, gencat, ccclass, bidicat, decomp,
+      dec, digit, num, mirror, uni1_0, comment, upcase,
+      lowcase, titlecase = l.split(";", 15)
+    code = code.hex
+    ccclass = ccclass.to_i
+    canon, compat = hex2str(decomp)
+    upcase = hex_or_nil(upcase)
+    lowcase = hex_or_nil(lowcase)
+    titlecase = hex_or_nil(titlecase)
+    udata[code] = [ccclass, canon, compat, upcase, lowcase, titlecase, gencat]
+    if charname =~ /^<(.*, (First|Last))>$/
+      charname = $1.upcase.gsub(/,? /, '_')
+      range_data << [charname, code]
+    end
+  end
+end
+
+## scan EastAsianWidth
+ea_width = {}
+open(ARGV[3]) do |f|
+  while l = f.gets
+    l.chomp!
+    next if l =~ /^\#/ || l =~ /^$/
+    l =~ /^(.*)\s+#\s*(.*)$/
+    l = $1
+    comment = $2
+    code, width = l.split(/;/)
+    if code =~ /\.\./
+      start_code, end_code = code.split('..')
+      start_code = start_code.hex
+      end_code = end_code.hex
+      (start_code..end_code).each do |code|
+        ea_width[code] = width
+      end
+      next
+    end
+    code = code.hex
+    ea_width[code] = width
+  end
+end
+
+print HEAD
+range_data.each do |charname, code|
+  printf("#define %s\t(0x%04x)\n", charname, code)
+end
+
+print HEAD1
+udata.sort.each do |code, data|
+  ccclass, canon, compat, upcase, lowcase, titlecase, gencat = data
+  ## Exclusions
+  ex = 0
+  if exclusion[code] ## Script-specifics or Post Composition Version
+    ex = 1
+  elsif canon =~ /^.$/ ## Singletons
+    ex = 2
+  elsif !canon.nil?
+    starter = canon.unpack("U*")[0]
+    if udata[starter][0] != 0 ## Non-starter decompositions
+      ex = 3
+    end
+  end
+  ## Special Casing
+  if casing[code]
+    lowcase = casing[code][0] if casing[code][0]
+    titlecase = casing[code][1] if casing[code][1]
+    upcase = casing[code][2] if casing[code][2]
+  end
+  width = 'N'
+  if ea_width[code]
+    width = ea_width[code]
+  end
+
+  printf(" { 0x%04x, %s, %s, %s, %s, %s, %d, %d, c_%s, w_%s }, \n",
+         code, printstr(canon),
+         printstr(compat), printstr(upcase), printstr(lowcase),
+         printstr(titlecase), ccclass, ex, gencat, width)
+end
+printf(" { -1, NULL, NULL, NULL, NULL, NULL, 0, 0, 0, 0 }\n")
+print TAIL
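To make the generator easier to follow, this is roughly what the two string helpers above return, using LATIN SMALL LETTER E WITH ACUTE (U+00E9), whose UnicodeData.txt decomposition field is "0065 0301", and NO-BREAK SPACE (U+00A0), whose field is "<noBreak> 0020" (values illustrative, derived from the definitions above):

  # hex2str: canonical mappings fill both slots, <tagged> mappings only the compat slot
  hex2str("0065 0301")       #=> ["e\u0301", "e\u0301"]
  hex2str("<noBreak> 0020")  #=> [nil, " "]

  # printstr: printable ASCII is kept, everything else is escaped octally for the C table
  printstr("e\u0301")        #=> "\"e\\314\\201\""

Each record emitted into unidata[] is one struct unicode_data initializer built from these strings plus the combining class, the exclusion flag, the general category and the East Asian width.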
data/tools/normtest.rb
ADDED
@@ -0,0 +1,111 @@
+#! /usr/local/bin/ruby -KU
+
+## Conformance test with NormalizationTest.txt
+## Copyright 2010 yoshidam
+
+require 'unicode'
+
+TESTFILE = "NormalizationTest.txt"
+
+def from_hex(str)
+  ret = ""
+  chars = str.split(" ")
+  chars.each do |c|
+    ret << [c.hex].pack("U")
+  end
+  return ret
+end
+
+def to_hex(str)
+  ret = ""
+  str = str.unpack('U*')
+  str.each do |c|
+    ret += sprintf("%04X ", c)
+  end
+  ret
+end
+
+open(TESTFILE) do |f|
+  while l = f.gets
+    next if l =~ /^#/
+    l.chomp!
+    if l =~ /^@/
+      puts l
+      next
+    end
+    c1, c2, c3, c4, c5 = l.split(';')
+    code = c1
+    c1 = from_hex(c1)
+    c2 = from_hex(c2)
+    c3 = from_hex(c3)
+    c4 = from_hex(c4)
+    c5 = from_hex(c5)
+    ## NFC TEST
+    if c2 == Unicode.nfc(c1) && c2 == Unicode.nfc(c2) &&
+       c2 == Unicode.nfc(c3) &&
+       c4 == Unicode.nfc(c4) && c4 == Unicode.nfc(c5)
+      ##puts "NFC OK: " + code
+    else
+      puts "NFC NG: " + to_hex(c1)
+      printf(" c2=%s NFC(c1)=%s NFC(c2)=%s NFC(c3)=%s\n",
+             to_hex(c2),
+             to_hex(Unicode.nfc(c1)),
+             to_hex(Unicode.nfc(c2)),
+             to_hex(Unicode.nfc(c3)))
+      printf(" c4=%s NFC(c4)=%s NFC(c5)=%s\n",
+             to_hex(c4),
+             to_hex(Unicode.nfc(c4)),
+             to_hex(Unicode.nfc(c5)))
+    end
+
+    ## NFD TEST
+    if c3 == Unicode.nfd(c1) && c3 == Unicode.nfd(c2) &&
+       c3 == Unicode.nfd(c3) &&
+       c5 == Unicode.nfd(c4) && c5 == Unicode.nfd(c5)
+      ##puts "NFD OK: " + code
+    else
+      puts "NFD NG: " + to_hex(c1)
+      printf(" c3=%s NFD(c1)=%s NFD(c2)=%s NFD(c3)=%s\n",
+             to_hex(c3),
+             to_hex(Unicode.nfd(c1)),
+             to_hex(Unicode.nfd(c2)),
+             to_hex(Unicode.nfd(c3)))
+      printf(" c5=%s NFD(c4)=%s NFD(c5)=%s\n",
+             to_hex(c5),
+             to_hex(Unicode.nfd(c4)),
+             to_hex(Unicode.nfd(c5)))
+    end
+
+    ## NFKC TEST
+    if c4 == Unicode.nfkc(c1) && c4 == Unicode.nfkc(c2) &&
+       c4 == Unicode.nfkc(c3) &&
+       c4 == Unicode.nfkc(c4) && c4 == Unicode.nfkc(c5)
+      ##puts "NFKC OK: " + code
+    else
+      puts "NFKC NG: " + to_hex(c1)
+      printf(" c4=%s NFKC(c1)=%s NFKC(c2)=%s NFKC(c3)=%s NFKC(c4)=%s NFKC(c5)=%s\n",
+             to_hex(c4),
+             to_hex(Unicode.nfkc(c1)),
+             to_hex(Unicode.nfkc(c2)),
+             to_hex(Unicode.nfkc(c3)),
+             to_hex(Unicode.nfkc(c4)),
+             to_hex(Unicode.nfkc(c5)))
+    end
+
+    ## NFKD TEST
+    if c5 == Unicode.nfkd(c1) && c5 == Unicode.nfkd(c2) &&
+       c5 == Unicode.nfkd(c3) &&
+       c5 == Unicode.nfkd(c4) && c5 == Unicode.nfkd(c5)
+      ##puts "NFKD OK: " + code
+    else
+      puts "NFKD NG: " + to_hex(c1)
+      printf(" c5=%s NFKD(c1)=%s NFKD(c2)=%s NFKD(c3)=%s NFKD(c4)=%s NFKD(c5)=%s\n",
+             to_hex(c5),
+             to_hex(Unicode.nfkd(c1)),
+             to_hex(Unicode.nfkd(c2)),
+             to_hex(Unicode.nfkd(c3)),
+             to_hex(Unicode.nfkd(c4)),
+             to_hex(Unicode.nfkd(c5)))
+    end
+  end
+end
data/unicode.gemspec
ADDED
@@ -0,0 +1,30 @@
+# -*- encoding: utf-8 -*-
+
+Gem::Specification.new do |s|
+  s.name = %q{unicode}
+  s.version = "0.4.4"
+
+  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
+  s.authors = [%q{Yoshida Masato}]
+  s.date = %q{2013-02-07}
+  s.email = %q{yoshidam@yoshidam.net}
+  s.licenses = %w[Ruby]
+  s.extensions = %w[ext/unicode/extconf.rb]
+  s.extra_rdoc_files = [%q{README}]
+  s.files = %w[
+    README Rakefile unicode.gemspec lib/unicode.rb
+    test/test.rb tools/README tools/mkunidata.rb tools/normtest.rb
+    ext/unicode/extconf.rb ext/unicode/unicode.c ext/unicode/unidata.map
+    ext/unicode/ustring.c ext/unicode/ustring.h ext/unicode/wstring.c ext/unicode/wstring.h
+  ]
+  s.homepage = %q{http://www.yoshidam.net/Ruby.html#unicode}
+  s.require_paths = [%q{lib}]
+  s.rubygems_version = %q{1.8.6}
+  s.summary = %q{Unicode normalization library.}
+  s.description = %q{Unicode normalization library.}
+
+  if s.respond_to? :specification_version then
+    s.specification_version = 3
+  end
+end
+
metadata
ADDED
@@ -0,0 +1,52 @@
+--- !ruby/object:Gem::Specification
+name: unicode
+version: !ruby/object:Gem::Version
+  version: 0.4.4
+platform: java
+authors:
+- Yoshida Masato
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2013-02-07 00:00:00.000000000 Z
+dependencies: []
+description: Unicode normalization library.
+email: yoshidam@yoshidam.net
+executables: []
+extensions: []
+extra_rdoc_files:
+- README
+files:
+- README
+- Rakefile
+- lib/unicode.rb
+- test/test.rb
+- tools/README
+- tools/mkunidata.rb
+- tools/normtest.rb
+- unicode.gemspec
+homepage: http://www.yoshidam.net/Ruby.html#unicode
+licenses:
+- Ruby
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.4.4
+signing_key:
+specification_version: 3
+summary: Unicode normalization library.
+test_files: []