camertron-eprun 1.1.0

@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+ metadata.gz: d9ad241156fd22c9d731cd199bcbed44c536335b
+ data.tar.gz: 336265b2dab02fbde594fbbd096b5658a4121be8
+ SHA512:
+ metadata.gz: 7cc4b9b77085815f16332187963520cf2155eb6f01fab8807209f0c685a8bd9c3f0f0549a83d6af4f336c1adb3164c27e5e97f906c495d8aed7fe88564ca7d4a
+ data.tar.gz: 9d03fab650391d0baf79c481b6ae1f3fbcf32be5a67b57edf310ecf96200ceeadc58a523810dec54680c46e72ac47d61005116dd8513d8812ce92f4b34d24f7f
data/Gemfile ADDED
@@ -0,0 +1,17 @@
+ source "http://rubygems.org"
+
+ gemspec
+
+ group :development do
+   gem "rake"
+   gem "pry-nav"
+   gem 'unicode'
+   gem 'unf'
+   # gem 'unicode_utils'
+   gem 'activesupport'
+ end
+
+ group :test do
+   gem "rspec"
+   gem "rr"
+ end
@@ -0,0 +1,4 @@
+ == 1.0.0
+
+ * Repo converted into gem.
+ * Added MRI 1.8 compatibility.
data/LICENSE ADDED
@@ -0,0 +1,6 @@
+ Copyright 2010-2013 Ayumu Nojima (野島 歩) and Martin J. Dürst (duerst@it.aoyama.ac.jp)
+ available under the same licence as Ruby itself
+ (see http://www.ruby-lang.org/en/LICENSE.txt)
+
+ The files in the 'data' subdirectory are © Unicode Consortium,
+ downloaded from http://www.unicode.org/Public/UCD/latest/ucd/.
@@ -0,0 +1,63 @@
+ Efficient Pure Ruby Unicode Normalization (eprun)
+ =================================================
+
+ (pronounced e-prune)
+
+ The Talk
+ --------
+
+ Please see the
+ [Internationalization & Unicode Conference 37](http://www.unicodeconference.org/)
+ talk on
+ [Implementing Normalization in Pure Ruby - the Fast and Easy Way](http://www.sw.it.aoyama.ac.jp/2013/pub/RubyNorm/).
+
+ Directories and Files
+ ---------------------
+
+ * lib/normalize.rb: The core normalization code.
+ * lib/string_normalize.rb: String#normalize.
+ * lib/generate.rb: Generation script; generates lib/normalize_tables.rb
+   from data/UnicodeData.txt and data/CompositionExclusions.txt.
+   This needs to be run only once when updating to a new Unicode version.
+ * lib/normalize_tables.rb: Data used for normalization,
+   automatically generated by lib/generate.rb.
+ * data/: All three files in this directory are downloaded from the
+   [Unicode Character Database](http://www.unicode.org/Public/UCD/latest/ucd/).
+   They are currently at Unicode version 6.3. They need to be updated
+   whenever a new Unicode version appears (about once a year).
+ * test/test_normalize.rb: Tests for lib/string_normalize.rb,
+   using data/NormalizationTest.txt.
+ * benchmark/benchmark.rb: Runs the benchmark with example text files.
+   Automatically checks for existing gems/libraries; if, e.g., the unicode_utils
+   gem is not available, that part of the benchmark is skipped.
+   This also applies to eprun, which will not be run on Ruby 1.8.
+ * benchmark/Deutsch_.txt, Japanese_.txt, Korean_.txt, Vietnamese_.txt:
+   Example texts extracted from random Wikipedia pages
+   (see http://en.wikipedia.org/wiki/Wikipedia:Random).
+   The languages are chosen based on the number of characters affected
+   by normalization (Deutsch < Japanese < Vietnamese < Korean).
+   These files have somewhat differing lengths,
+   so the results cannot be compared directly across languages.
+   Adding other files ending in "_.txt" will include them in
+   the benchmark.
+ * benchmark/benchmark_results.rb:
+   Results of the benchmark for eprun, unicode_utils,
+   ActiveSupport::Multibyte (version 3.0.0), twitter_cldr, and the unicode gem.
+   The eprun, unicode_utils, and unicode normalizations are run 100 times each,
+   ActiveSupport::Multibyte is run 10 times each, and
+   twitter_cldr is run only once (we didn't want to wait any longer).
+ * benchmark/benchmark_results_jruby.txt:
+   Results of the benchmark when run under JRuby (excludes the unicode gem),
+   version 1.7.4 (1.9.3p392, 2013-05-16 2390d3b on Java HotSpot(TM) Client VM 1.7.0_07-b10 [Windows 7-x86]).
+ * benchmark/benchmark.pl: Runs the benchmark using Perl, both with the
+   xsub (i.e. C) version (run 100 times) and the pure Perl version
+   (run 10 times).
+ * benchmark/benchmark_results_pl.txt: Results of the Perl benchmarks.
+
+ TODOs and Ideas
+ ---------------
+ * Publish as a gem, or several gems.
+ * Deal better with encodings other than UTF-8.
+ * Add methods such as String#nfc, String#nfd, ...
+ * Add methods for normalization variants.
+ * See the [talk](http://www.sw.it.aoyama.ac.jp/2013/pub/RubyNorm/) for more.
@@ -0,0 +1,40 @@
+ # encoding: utf-8
+
+ # Copyright 2010-2013 Ayumu Nojima (野島 歩) and Martin J. Dürst (duerst@it.aoyama.ac.jp)
+ # available under the same licence as Ruby itself
+ # (see http://www.ruby-lang.org/en/LICENSE.txt)
+
+ ROOT_DIR = Pathname.new(File.join(File.dirname(__FILE__)))
+ $:.push(ROOT_DIR.to_s)
+
+ require 'eprun/helpers'
+ require 'tasks/erb_template'
+ require 'tasks/tables_generator'
+
+ require 'rubygems/package_task'
+
+ task :default => :test
+ Bundler::GemHelper.install_tasks
+
+ task :test do
+   require 'test/unit'
+   files = Dir.glob("./test/test_*.rb")
+   runner = Test::Unit::AutoRunner.new(true)
+   runner.process_args(files)
+   runner.run
+ end
+
+ task :generate_tables do
+   EprunTasks::TablesGenerator.new(
+     ROOT_DIR.join("data").to_s,
+     ROOT_DIR.join("lib", Eprun.require_path).to_s
+   ).generate
+ end
+
+ task :benchmark do
+   require 'eprun'
+   require ROOT_DIR.join("benchmark/benchmark").to_s
+
+   Eprun.enable_core_extensions!
+   EprunTasks::Benchmarks.new(ROOT_DIR.join("benchmark").to_s).run
+ end
@@ -0,0 +1,9 @@
+ # encoding: utf-8
+
+ # Copyright 2010-2013 Ayumu Nojima (野島 歩) and Martin J. Dürst (duerst@it.aoyama.ac.jp)
+ # available under the same licence as Ruby itself
+ # (see http://www.ruby-lang.org/en/LICENSE.txt)
+
+ require "eprun/helpers"
+ Eprun.require_file("tables")
+ Eprun.require_file("normalize")
@@ -0,0 +1,20 @@
+ # encoding: utf-8
+
+ # Copyright 2010-2013 Ayumu Nojima (野島 歩) and Martin J. Dürst (duerst@it.aoyama.ac.jp)
+ # available under the same licence as Ruby itself
+ # (see http://www.ruby-lang.org/en/LICENSE.txt)
+
+ class String
+   def normalize(form = :nfc)
+     Eprun.normalize(self, form)
+   end
+
+   def normalize!(form = :nfc)
+     replace(self.normalize(form))
+   end
+
+   def normalized?(form = :nfc)
+     Eprun.normalized?(self, form)
+   end
+ end
+
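A minimal usage sketch for the core extension above. It assumes the library is required as `eprun` (as lib/eprun.rb does) and that the generated tables are available; the sample strings are illustrative only.

    # Sketch: exercising String#normalize / #normalize! / #normalized? from above.
    require 'eprun'
    Eprun.enable_core_extensions!

    composed   = "café"                    # precomposed U+00E9
    decomposed = composed.normalize(:nfd)  # "e" followed by U+0301 combining acute
    decomposed.normalized?(:nfd)           # => true
    decomposed.normalize!(:nfc)            # recompose in place
    decomposed == composed                 # => true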
@@ -0,0 +1,27 @@
+ # encoding: utf-8
+
+ # Copyright 2010-2013 Ayumu Nojima (野島 歩) and Martin J. Dürst (duerst@it.aoyama.ac.jp)
+ # available under the same licence as Ruby itself
+ # (see http://www.ruby-lang.org/en/LICENSE.txt)
+
+ class Eprun
+   class << self
+
+     def enable_core_extensions!
+       require 'eprun/core_ext/string' unless "".respond_to?(:normalize)
+     end
+
+     def ruby18?
+       RUBY_VERSION >= "1.8.0" && RUBY_VERSION < "1.9.0"
+     end
+
+     def require_path
+       ruby18? ? "eprun/ruby18" : "eprun"
+     end
+
+     def require_file(file)
+       require File.join(require_path, file)
+     end
+
+   end
+ end
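For illustration only (not one of the gem's files): the helpers above select between the 1.9+ and Ruby 1.8 implementations purely by require path, so the rest of the code never branches on the Ruby version.

    # Sketch of the path selection done by require_path / require_file.
    Eprun.require_path              # => "eprun" on 1.9+, "eprun/ruby18" on 1.8
    Eprun.require_file("tables")    # requires "eprun/tables" or "eprun/ruby18/tables"
    Eprun.require_file("normalize") # requires the matching normalize implementation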
@@ -0,0 +1,185 @@
+ # encoding: utf-8
+
+ # Copyright 2010-2013 Ayumu Nojima (野島 歩) and Martin J. Dürst (duerst@it.aoyama.ac.jp)
+ # available under the same licence as Ruby itself
+ # (see http://www.ruby-lang.org/en/LICENSE.txt)
+
+ class Eprun
+   class << self
+
+     # Constant for max hash capacity to avoid DoS attack
+     MAX_HASH_LENGTH = 18000 # enough for all test cases, otherwise tests get slow
+
+     ## Regular Expressions and Hash Constants
+     REGEXP_D = Regexp.compile(REGEXP_D_STRING, Regexp::EXTENDED)
+     REGEXP_C = Regexp.compile(REGEXP_C_STRING, Regexp::EXTENDED)
+     REGEXP_K = Regexp.compile(REGEXP_K_STRING, Regexp::EXTENDED)
+
+     NF_HASH_D = Hash.new do |hash, key|
+       hash.delete hash.first[0] if hash.length > MAX_HASH_LENGTH # prevent DoS attack
+       hash[key] = Eprun.nfd_one(key)
+     end
+
+     NF_HASH_C = Hash.new do |hash, key|
+       hash.delete hash.first[0] if hash.length > MAX_HASH_LENGTH # prevent DoS attack
+       hash[key] = Eprun.nfc_one(key)
+     end
+
+     NF_HASH_K = Hash.new do |hash, key|
+       hash.delete hash.first[0] if hash.length > MAX_HASH_LENGTH # prevent DoS attack
+       hash[key] = Eprun.nfkd_one(key)
+     end
+
+     def nf_hash_d
+       NF_HASH_D
+     end
+
+     def nf_hash_c
+       NF_HASH_C
+     end
+
+     def nf_hash_k
+       NF_HASH_K
+     end
+
+     ## Constants For Hangul
+     SBASE = 0xAC00
+     LBASE = 0x1100
+     VBASE = 0x1161
+     TBASE = 0x11A7
+     LCOUNT = 19
+     VCOUNT = 21
+     TCOUNT = 28
+     NCOUNT = VCOUNT * TCOUNT
+     SCOUNT = LCOUNT * NCOUNT
+
+
+     ## Hangul Algorithm
+     def hangul_decomp_one(target)
+       sIndex = target.ord - SBASE
+       return target if sIndex < 0 || sIndex >= SCOUNT
+       l = LBASE + sIndex / NCOUNT
+       v = VBASE + (sIndex % NCOUNT) / TCOUNT
+       t = TBASE + sIndex % TCOUNT
+       (t == TBASE ? [l, v] : [l, v, t]).pack('U*') + target[1..-1]
+     end
+
+     def hangul_comp_one(string)
+       length = string.length
+       in_range = length > 1 &&
+                  0 <= (lead = string[0].ord - LBASE) &&
+                  lead < LCOUNT &&
+                  0 <= (vowel = string[1].ord - VBASE) &&
+                  vowel < VCOUNT
+
+       if in_range
+         lead_vowel = SBASE + (lead * VCOUNT + vowel) * TCOUNT
+         if length > 2 && 0 <= (trail = string[2].ord - TBASE) && trail < TCOUNT
+           (lead_vowel + trail).chr(Encoding::UTF_8) + string[3..-1]
+         else
+           lead_vowel.chr(Encoding::UTF_8) + string[2..-1]
+         end
+       else
+         string
+       end
+     end
+
+     ## Canonical Ordering
+     def canonical_ordering_one(string)
+       sorting = string.each_char.collect { |c| [c, CLASS_TABLE[c]] }
+       (sorting.length - 2).downto(0) do |i| # bubble sort
+         (0..i).each do |j|
+           later_class = sorting[j + 1].last
+           if 0 < later_class && later_class < sorting[j].last
+             sorting[j], sorting[j + 1] = sorting[j + 1], sorting[j]
+           end
+         end
+       end
+       sorting.collect(&:first).join
+     end
+
+     ## Normalization Forms for Patterns (not whole Strings)
+     def nfd_one(string)
+       string = string.dup
+       (0...string.length).each do |position|
+         if decomposition = DECOMPOSITION_TABLE[string[position]]
+           string[position] = decomposition
+         end
+       end
+       canonical_ordering_one(hangul_decomp_one(string))
+     end
+
+     def nfkd_one(string)
+       string = string.dup
+       position = 0
+       while position < string.length
+         if decomposition = KOMPATIBLE_TABLE[string[position]]
+           string[position] = decomposition
+         else
+           position += 1
+         end
+       end
+       string
+     end
+
+     def nfc_one(string)
+       nfd_string = nfd_one string
+       start = nfd_string[0]
+       last_class = CLASS_TABLE[start] - 1
+       accents = ''
+       nfd_string[1..-1].each_char do |accent|
+         accent_class = CLASS_TABLE[accent]
+         if last_class < accent_class && composite = COMPOSITION_TABLE[start + accent]
+           start = composite
+         else
+           accents += accent
+           last_class = accent_class
+         end
+       end
+       hangul_comp_one(start + accents)
+     end
+
+     def normalize(string, form = :nfc)
+       encoding = string.encoding
+       if encoding == Encoding::UTF_8
+         case form
+         when :nfc then
+           string.gsub(REGEXP_C, NF_HASH_C)
+         when :nfd then
+           string.gsub(REGEXP_D, NF_HASH_D)
+         when :nfkc then
+           string.gsub(REGEXP_K, NF_HASH_K).gsub(REGEXP_C, NF_HASH_C)
+         when :nfkd then
+           string.gsub(REGEXP_K, NF_HASH_K).gsub(REGEXP_D, NF_HASH_D)
+         else
+           raise ArgumentError, "Invalid normalization form #{form}."
+         end
+       else
+         normalize(string.encode(Encoding::UTF_8), form).encode(encoding)
+       end
+     end
+
+     def normalized?(string, form = :nfc)
+       string = string.encode(Encoding::UTF_8) unless string.encoding == Encoding::UTF_8
+       case form
+       when :nfc then
+         string.scan(REGEXP_C) do |match|
+           return false if NF_HASH_C[match] != match
+         end
+         true
+       when :nfd then
+         string.scan(REGEXP_D) do |match|
+           return false if NF_HASH_D[match] != match
+         end
+         true
+       when :nfkc then
+         normalized?(string, :nfc) && string !~ REGEXP_K
+       when :nfkd then
+         normalized?(string, :nfd) && string !~ REGEXP_K
+       else
+         raise ArgumentError, "Invalid normalization form #{form}."
+       end
+     end
+
+   end
+ end
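A short, illustrative sketch of calling the module-level API defined above directly; the example strings are assumptions for illustration, not test data from the gem.

    # Sketch: Eprun.normalize / Eprun.normalized? on UTF-8 strings.
    decomposed = [0x304B, 0x3099].pack("U*")  # KA + combining voiced sound mark
    Eprun.normalize(decomposed, :nfc)         # => "が" (single precomposed codepoint)
    Eprun.normalized?(decomposed, :nfc)       # => false
    Eprun.normalize("ｶﾞ", :nfkc)              # => "ガ" (half-width katakana folded)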
@@ -0,0 +1,198 @@
+ # encoding: utf-8
+
+ # Copyright 2010-2013 Ayumu Nojima (野島 歩) and Martin J. Dürst (duerst@it.aoyama.ac.jp)
+ # available under the same licence as Ruby itself
+ # (see http://www.ruby-lang.org/en/LICENSE.txt)
+
+ class Eprun
+   class << self
+
+     ## Constant for max hash capacity to avoid DoS attack
+     MAX_HASH_LENGTH = 18000 # enough for all test cases, otherwise tests get slow
+
+     ## Regular Expressions and Hash Constants
+     REGEXP_D = Regexp.compile(REGEXP_D_STRING, Regexp::EXTENDED)
+     REGEXP_C = Regexp.compile(REGEXP_C_STRING, Regexp::EXTENDED)
+     REGEXP_K = Regexp.compile(REGEXP_K_STRING, Regexp::EXTENDED)
+
+     NF_HASH_D = Hash.new do |hash, key|
+       hash.delete hash.first[0] if hash.length > MAX_HASH_LENGTH # prevent DoS attack
+       hash[key] = Eprun.nfd_one(key).pack("U*")
+     end
+
+     NF_HASH_C = Hash.new do |hash, key|
+       hash.delete hash.first[0] if hash.length > MAX_HASH_LENGTH # prevent DoS attack
+       hash[key] = Eprun.nfc_one(key).pack("U*")
+     end
+
+     NF_HASH_K = Hash.new do |hash, key|
+       hash.delete hash.first[0] if hash.length > MAX_HASH_LENGTH # prevent DoS attack
+       hash[key] = Eprun.nfkd_one(key).pack("U*")
+     end
+
+     def nf_hash_d
+       NF_HASH_D
+     end
+
+     def nf_hash_c
+       NF_HASH_C
+     end
+
+     def nf_hash_k
+       NF_HASH_K
+     end
+
+     ## Constants For Hangul
+     SBASE = 0xAC00
+     LBASE = 0x1100
+     VBASE = 0x1161
+     TBASE = 0x11A7
+     LCOUNT = 19
+     VCOUNT = 21
+     TCOUNT = 28
+     NCOUNT = VCOUNT * TCOUNT
+     SCOUNT = LCOUNT * NCOUNT
+
+     def get_codepoints(source)
+       if source.is_a?(Array)
+         source
+       elsif source.is_a?(String)
+         source.unpack("U*")
+       else
+         raise ArgumentError, "Source must be a string or an array."
+       end
+     end
+
+     ## Hangul Algorithm
+     def hangul_decomp_one(target)
+       cps = get_codepoints(target)
+       sIndex = cps.first - SBASE
+       return target if sIndex < 0 || sIndex >= SCOUNT
+       l = LBASE + sIndex / NCOUNT
+       v = VBASE + (sIndex % NCOUNT) / TCOUNT
+       t = TBASE + sIndex % TCOUNT
+       (t == TBASE ? [l, v] : [l, v, t]) + cps[1..-1]
+     end
+
+     def hangul_comp_one(string)
+       cps = get_codepoints(string)
+       length = cps.length
+
+       in_range = length > 1 &&
+                  0 <= (lead = cps[0] - LBASE) &&
+                  lead < LCOUNT &&
+                  0 <= (vowel = cps[1] - VBASE) &&
+                  vowel < VCOUNT
+
+       if in_range
+         lead_vowel = SBASE + (lead * VCOUNT + vowel) * TCOUNT
+         if length > 2 && 0 <= (trail = cps[2] - TBASE) && trail < TCOUNT
+           [lead_vowel + trail] + cps[3..-1]
+         else
+           [lead_vowel] + cps[2..-1]
+         end
+       else
+         string
+       end
+     end
+
+     ## Canonical Ordering
+     def canonical_ordering_one(string)
+       cps = get_codepoints(string)
+       sorting = cps.collect { |c| [c, CLASS_TABLE[c]] }
+
+       (sorting.length - 2).downto(0) do |i| # bubble sort
+         (0..i).each do |j|
+           later_class = sorting[j + 1].last
+           if 0 < later_class && later_class < sorting[j].last
+             sorting[j], sorting[j + 1] = sorting[j + 1], sorting[j]
+           end
+         end
+       end
+       sorting.collect(&:first)
+     end
+
+     ## Normalization Forms for Patterns (not whole Strings)
+     def nfd_one(string)
+       cps = get_codepoints(string)
+       cps = cps.inject([]) do |ret, cp|
+         if decomposition = DECOMPOSITION_TABLE[cp]
+           ret += decomposition
+         else
+           ret << cp
+         end
+       end
+
+       canonical_ordering_one(hangul_decomp_one(cps))
+     end
+
+     def nfkd_one(string)
+       cps = get_codepoints(string)
+       final_cps = []
+       position = 0
+       while position < cps.length
+         if decomposition = KOMPATIBLE_TABLE[cps[position]]
+           final_cps += nfkd_one(decomposition)
+         else
+           final_cps << cps[position]
+         end
+         position += 1
+       end
+       final_cps
+     end
+
+     def nfc_one(string)
+       nfd_cps = nfd_one(string)
+       start = nfd_cps[0]
+       last_class = CLASS_TABLE[start] - 1
+       accents = []
+       nfd_cps[1..-1].each do |accent_cp|
+         accent_class = CLASS_TABLE[accent_cp]
+         if last_class < accent_class && composite = COMPOSITION_TABLE[[start, accent_cp]]
+           start = composite
+         else
+           accents << accent_cp
+           last_class = accent_class
+         end
+       end
+       hangul_comp_one([start] + accents)
+     end
+
+     def normalize(string, form = :nfc)
+       case form
+       when :nfc then
+         string.gsub(REGEXP_C) { |s| NF_HASH_C[s] }
+       when :nfd then
+         string.gsub(REGEXP_D) { |s| NF_HASH_D[s] }
+       when :nfkc then
+         string.gsub(REGEXP_K) { |s| NF_HASH_K[s] }.gsub(REGEXP_C) { |s| NF_HASH_C[s] }
+       when :nfkd then
+         string.gsub(REGEXP_K) { |s| NF_HASH_K[s] }.gsub(REGEXP_D) { |s| NF_HASH_D[s] }
+       else
+         raise ArgumentError, "Invalid normalization form #{form}."
+       end
+     end
+
+     def normalized?(string, form = :nfc)
+       case form
+       when :nfc then
+         string.scan REGEXP_C do |match|
+           return false if NF_HASH_C[match] != match
+         end
+         true
+       when :nfd then
+         string.scan REGEXP_D do |match|
+           return false if NF_HASH_D[match] != match
+         end
+         true
+       when :nfkc then
+         normalized?(string, :nfc) && string !~ REGEXP_K
+       when :nfkd then
+         normalized?(string, :nfd) && string !~ REGEXP_K
+       else
+         raise ArgumentError, "Invalid normalization form #{form}."
+       end
+     end
+
+   end
+ end # class
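As a rough sketch of how this Ruby 1.8 variant differs from the one above: the per-pattern helpers take strings or codepoint arrays (via get_codepoints) and return codepoint arrays, which the NF_HASH_* defaults pack back into strings. The values below are illustrative, shown in decimal.

    # Sketch: the 1.8-oriented helpers work on arrays of codepoints.
    Eprun.nfd_one("é")                    # => [101, 769]  (e + combining acute)
    Eprun.nfc_one([101, 769])             # => [233]
    Eprun.nfc_one([101, 769]).pack("U*")  # => "é"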