camertron-eprun 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: d9ad241156fd22c9d731cd199bcbed44c536335b
4
+ data.tar.gz: 336265b2dab02fbde594fbbd096b5658a4121be8
5
+ SHA512:
6
+ metadata.gz: 7cc4b9b77085815f16332187963520cf2155eb6f01fab8807209f0c685a8bd9c3f0f0549a83d6af4f336c1adb3164c27e5e97f906c495d8aed7fe88564ca7d4a
7
+ data.tar.gz: 9d03fab650391d0baf79c481b6ae1f3fbcf32be5a67b57edf310ecf96200ceeadc58a523810dec54680c46e72ac47d61005116dd8513d8812ce92f4b34d24f7f
data/Gemfile ADDED
@@ -0,0 +1,17 @@
1
# Gemfile: development/test dependencies; runtime dependencies come from the gemspec.
# Fix: use HTTPS for the gem source so downloads cannot be tampered with in transit.
source "https://rubygems.org"

gemspec

group :development do
  gem "rake"
  gem "pry-nav"
  gem "unicode"       # presumably for benchmark comparison — confirm against benchmark/
  gem "unf"           # presumably for benchmark comparison — confirm against benchmark/
  # gem "unicode_utils"
  gem "activesupport" # ActiveSupport::Multibyte appears in the benchmarks (see README)
end

group :test do
  gem "rspec"
  gem "rr"
end
@@ -0,0 +1,4 @@
1
+ == 1.0.0
2
+
3
+ * Repo converted into gem.
4
+ * Added MRI 1.8 compatibility.
data/LICENSE ADDED
@@ -0,0 +1,6 @@
1
+ Copyright 2010-2013 Ayumu Nojima (野島 歩) and Martin J. Dürst (duerst@it.aoyama.ac.jp)
2
+ available under the same licence as Ruby itself
3
+ (see http://www.ruby-lang.org/en/LICENSE.txt)
4
+
5
+ The files in the 'data' subdirectory are © Unicode Consortium,
6
+ downloaded from http://www.unicode.org/Public/UCD/latest/ucd/.
@@ -0,0 +1,63 @@
1
+ Efficient Pure Ruby Unicode Normalization (eprun)
2
+ =================================================
3
+
4
+ (pronounced e-prune)
5
+
6
+ The Talk
7
+ --------
8
+
9
+ Please see the
10
+ [Internationalization & Unicode Conference 37](http://www.unicodeconference.org/)
11
+ talk on
12
+ [Implementing Normalization in Pure Ruby - the Fast and Easy Way](http://www.sw.it.aoyama.ac.jp/2013/pub/RubyNorm/).
13
+
14
+ Directories and Files
15
+ ---------------------
16
+
17
+ * lib/normalize.rb: The core normalization code.
18
+ * lib/string_normalize.rb: String#normalize.
19
+ * lib/generate.rb: Generation script, generates lib/normalize_tables.rb
20
+ from data/UnicodeData.txt and data/CompositionExclusions.txt.
21
+ This needs to be run only once when updating to a new Unicode version.
22
+ * lib/normalize_tables.rb: Data used for normalization,
23
+ automatically generated by lib/generate.rb.
24
+ * data/: All three files in this directory are downloaded from the
25
+ [Unicode Character Database](http://www.unicode.org/Public/UCD/latest/ucd/).
26
+ They are currently at Unicode version 6.3. They need to be updated for
27
+ a newer Unicode version (happens about once a year).
28
+ * test/test_normalize.rb: Tests for lib/string_normalize.rb,
29
+ using data/NormalizationTest.txt.
30
+ * benchmark/benchmark.rb: Runs the benchmark with example text files.
31
+ Automatically checks for existing gems/libraries; if e.g. the unicode_utils
32
+ gem is not available, that part of the benchmark is skipped.
33
+ This also applies to eprun, which will not be run on Ruby 1.8.
34
+ * benchmark/Deutsch_.txt, Japanese_.txt, Korean_.txt, Vietnamese_.txt:
35
+ example texts extracted from random Wikipedia pages
36
+ (see http://en.wikipedia.org/wiki/Wikipedia:Random).
37
+ The languages are chosen based on number of characters affected
38
+ by normalization (Deutsch < Japanese < Vietnamese < Korean).
39
+ These files have somewhat differing lengths,
40
+ so the results cannot directly be compared across languages.
41
+ Adding other files with ending "_.txt" will include them in
42
+ the benchmark.
43
+ * benchmark/benchmark_results.rb:
44
+ Results of benchmark for eprun, unicode_utils,
45
+ ActiveSupport::Multibyte (version 3.0.0), twitter_cldr, and the unicode gem.
46
+ Eprun, unicode_utils, and unicode normalizations are run 100 times each,
47
+ ActiveSupport::Multibyte is run 10 times each, and
48
+ twitter_cldr is run only 1 time (didn't want to wait any longer).
49
+ * benchmark/benchmark_results_jruby.txt:
50
+ Results of benchmark when using jruby (excludes unicode gem),
51
+ version 1.7.4 (1.9.3p392, 2013-05-16 2390d3b on Java HotSpot(TM) Client VM 1.7.0_07-b10 [Windows 7-x86]).
52
+ * benchmark/benchmark.pl: Runs the benchmark using Perl, both with
53
+ xsub (i.e. C) version (run 100 times) and pure Perl version
54
+ (run 10 times).
55
+ * benchmark/benchmark_results_pl.txt: Results of Perl benchmarks.
56
+
57
+ TODOs and Ideas
58
+ ---------------
59
+ * Publish as a gem, or several gems.
60
+ * Deal better with encodings other than UTF-8.
61
+ * Add methods such as String#nfc, String#nfd,...
62
+ * Add methods for normalization variants.
63
+ * See [talk](http://www.sw.it.aoyama.ac.jp/2013/pub/RubyNorm/) for more.
@@ -0,0 +1,40 @@
1
# encoding: utf-8

# Copyright 2010-2013 Ayumu Nojima (野島 歩) and Martin J. Dürst (duerst@it.aoyama.ac.jp)
# available under the same licence as Ruby itself
# (see http://www.ruby-lang.org/en/LICENSE.txt)

require 'pathname' # Pathname was previously used without being required

# Project root (directory containing this Rakefile), also pushed onto the
# load path so `eprun/...` and `tasks/...` can be required directly.
ROOT_DIR = Pathname.new(File.join(File.dirname(__FILE__)))
$:.push(ROOT_DIR.to_s)

require 'eprun/helpers'
require 'tasks/erb_template'
require 'tasks/tables_generator'

require 'rubygems/package_task'

task :default => :test
Bundler::GemHelper.install_tasks

# Runs every test/test_*.rb file through Test::Unit.
task :test do
  require 'test/unit'
  files = Dir.glob("./test/test_*.rb")
  runner = Test::Unit::AutoRunner.new(true)
  runner.process_args(files)
  runner.run
end

# Regenerates the normalization tables from the Unicode data files in data/.
# Only needs to run when updating to a new Unicode version.
task :generate_tables do
  EprunTasks::TablesGenerator.new(
    ROOT_DIR.join("data").to_s,
    ROOT_DIR.join("lib", Eprun.require_path).to_s
  ).generate
end

# Runs the benchmark suite against the example texts in benchmark/.
task :benchmark do
  require 'eprun'
  require ROOT_DIR.join("benchmark/benchmark").to_s

  Eprun.enable_core_extensions!
  # Fix: `.to_s` was previously applied to the "benchmark" string literal
  # (a no-op) instead of the joined Pathname, so Benchmarks.new received a
  # Pathname rather than a String path, unlike every other call site here.
  EprunTasks::Benchmarks.new(ROOT_DIR.join("benchmark").to_s).run
end
@@ -0,0 +1,9 @@
1
# encoding: utf-8

# Copyright 2010-2013 Ayumu Nojima (野島 歩) and Martin J. Dürst (duerst@it.aoyama.ac.jp)
# available under the same licence as Ruby itself
# (see http://www.ruby-lang.org/en/LICENSE.txt)

# Gem entry point: the helpers pick the implementation directory for the
# running interpreter; the generated tables must be loaded before the
# normalizer that reads them.
require "eprun/helpers"

%w[tables normalize].each do |file|
  Eprun.require_file(file)
end
@@ -0,0 +1,20 @@
1
# encoding: utf-8

# Copyright 2010-2013 Ayumu Nojima (野島 歩) and Martin J. Dürst (duerst@it.aoyama.ac.jp)
# available under the same licence as Ruby itself
# (see http://www.ruby-lang.org/en/LICENSE.txt)

# Core extension: thin String wrappers around the Eprun normalizer.
class String
  # Returns a copy of this string normalized to +form+ (default :nfc).
  def normalize(form = :nfc)
    Eprun.normalize(self, form)
  end

  # In-place variant of #normalize; returns self with the normalized content.
  def normalize!(form = :nfc)
    replace(normalize(form))
  end

  # True when this string is already normalized in +form+.
  def normalized?(form = :nfc)
    Eprun.normalized?(self, form)
  end
end
@@ -0,0 +1,27 @@
1
# encoding: utf-8

# Copyright 2010-2013 Ayumu Nojima (野島 歩) and Martin J. Dürst (duerst@it.aoyama.ac.jp)
# available under the same licence as Ruby itself
# (see http://www.ruby-lang.org/en/LICENSE.txt)

# Interpreter-detection helpers: choose between the 1.9+ implementation and
# the codepoint-array fallback for MRI 1.8.
class Eprun
  # Loads the String core extensions unless String#normalize already exists.
  def self.enable_core_extensions!
    require 'eprun/core_ext/string' unless "".respond_to?(:normalize)
  end

  # True when running under Ruby 1.8.x.
  def self.ruby18?
    RUBY_VERSION >= "1.8.0" && RUBY_VERSION < "1.9.0"
  end

  # Directory (under lib/) holding the implementation for this interpreter.
  def self.require_path
    if ruby18?
      "eprun/ruby18"
    else
      "eprun"
    end
  end

  # Requires an implementation file relative to require_path.
  def self.require_file(file)
    require File.join(require_path, file)
  end
end
@@ -0,0 +1,185 @@
1
# encoding: utf-8

# Copyright 2010-2013 Ayumu Nojima (野島 歩) and Martin J. Dürst (duerst@it.aoyama.ac.jp)
# available under the same licence as Ruby itself
# (see http://www.ruby-lang.org/en/LICENSE.txt)

# Pure-Ruby Unicode normalization (NFC/NFD/NFKC/NFKD), string-based
# implementation for Ruby 1.9+ (uses String#ord, Integer#chr(Encoding),
# gsub with a Hash replacement). Only the substrings matched by the
# generated REGEXP_* patterns are renormalized; results are memoized
# in the NF_HASH_* caches.
class Eprun
  class << self

    # Constant for max hash capacity to avoid DoS attack
    MAX_HASH_LENGTH = 18000 # enough for all test cases, otherwise tests get slow

    ## Regular Expressions and Hash Constants
    # The REGEXP_*_STRING sources are defined in the generated tables file;
    # each matches the minimal substrings that may need work for that form.
    REGEXP_D = Regexp.compile(REGEXP_D_STRING, Regexp::EXTENDED)
    REGEXP_C = Regexp.compile(REGEXP_C_STRING, Regexp::EXTENDED)
    REGEXP_K = Regexp.compile(REGEXP_K_STRING, Regexp::EXTENDED)

    # Memoizing cache: matched substring -> its NFD form. Once the cache
    # exceeds MAX_HASH_LENGTH, the first (oldest-inserted) entry is evicted.
    NF_HASH_D = Hash.new do |hash, key|
      hash.delete hash.first[0] if hash.length > MAX_HASH_LENGTH # prevent DoS attack
      hash[key] = Eprun.nfd_one(key)
    end

    # Memoizing cache: matched substring -> its NFC form.
    NF_HASH_C = Hash.new do |hash, key|
      hash.delete hash.first[0] if hash.length > MAX_HASH_LENGTH # prevent DoS attack
      hash[key] = Eprun.nfc_one(key)
    end

    # Memoizing cache: matched substring -> its NFKD form (used as the first
    # gsub pass for both :nfkc and :nfkd in #normalize).
    NF_HASH_K = Hash.new do |hash, key|
      hash.delete hash.first[0] if hash.length > MAX_HASH_LENGTH # prevent DoS attack
      hash[key] = Eprun.nfkd_one(key)
    end

    # Read accessors for the caches (constants are not directly callable
    # from instance-side helpers/tests).
    def nf_hash_d
      NF_HASH_D
    end

    def nf_hash_c
      NF_HASH_C
    end

    def nf_hash_k
      NF_HASH_K
    end

    ## Constants For Hangul
    # Parameters of the algorithmic Hangul syllable (de)composition
    # (conjoining jamo arithmetic from the Unicode Standard).
    SBASE = 0xAC00
    LBASE = 0x1100
    VBASE = 0x1161
    TBASE = 0x11A7
    LCOUNT = 19
    VCOUNT = 21
    TCOUNT = 28
    NCOUNT = VCOUNT * TCOUNT
    SCOUNT = LCOUNT * NCOUNT


    ## Hangul Algorithm
    # Decomposes a leading precomposed Hangul syllable into its jamo
    # (L, V, and optionally T); returns `target` unchanged when its first
    # character is not a precomposed syllable.
    def hangul_decomp_one(target)
      sIndex = target.ord - SBASE
      return target if sIndex < 0 || sIndex >= SCOUNT
      l = LBASE + sIndex / NCOUNT
      v = VBASE + (sIndex % NCOUNT) / TCOUNT
      t = TBASE + sIndex % TCOUNT
      # t == TBASE means "no trailing consonant": emit only [l, v]
      (t == TBASE ? [l, v] : [l, v, t]).pack('U*') + target[1..-1]
    end

    # Composes leading conjoining jamo (L+V or L+V+T) back into a single
    # precomposed syllable; returns `string` unchanged when it does not
    # start with a composable jamo sequence.
    def hangul_comp_one(string)
      length = string.length
      in_range = length > 1 &&
                 0 <= (lead = string[0].ord - LBASE) &&
                 lead < LCOUNT &&
                 0 <= (vowel = string[1].ord - VBASE) &&
                 vowel < VCOUNT

      if in_range
        lead_vowel = SBASE + (lead * VCOUNT + vowel) * TCOUNT
        if length > 2 && 0 <= (trail = string[2].ord - TBASE) && trail < TCOUNT
          (lead_vowel + trail).chr(Encoding::UTF_8) + string[3..-1]
        else
          lead_vowel.chr(Encoding::UTF_8) + string[2..-1]
        end
      else
        string
      end
    end

    ## Canonical Ordering
    # Sorts combining marks by canonical combining class (CLASS_TABLE) with
    # a stable bubble sort; starters (class 0) block reordering.
    def canonical_ordering_one(string)
      sorting = string.each_char.collect { |c| [c, CLASS_TABLE[c]] }
      (sorting.length - 2).downto(0) do |i| # bubble sort
        (0..i).each do |j|
          later_class = sorting[j + 1].last
          # swap only when the later char is a combining mark (class > 0)
          # with a strictly smaller class — keeps equal classes stable
          if 0 < later_class && later_class < sorting[j].last
            sorting[j], sorting[j + 1] = sorting[j + 1], sorting[j]
          end
        end
      end
      sorting.collect(&:first).join
    end

    ## Normalization Forms for Patterns (not whole Strings)
    # Canonical decomposition of one matched substring: per-character
    # DECOMPOSITION_TABLE lookup, then Hangul decomposition and canonical
    # reordering.
    def nfd_one(string)
      string = string.dup
      (0...string.length).each do |position|
        if decomposition = DECOMPOSITION_TABLE[string[position]]
          string[position] = decomposition
        end
      end
      canonical_ordering_one(hangul_decomp_one(string))
    end

    # Compatibility decomposition of one matched substring. The position is
    # deliberately NOT advanced after a replacement, so the replacement is
    # re-examined (KOMPATIBLE_TABLE entries may decompose further).
    def nfkd_one(string)
      string = string.dup
      position = 0
      while position < string.length
        if decomposition = KOMPATIBLE_TABLE[string[position]]
          string[position] = decomposition
        else
          position += 1
        end
      end
      string
    end

    # Canonical composition of one matched substring: decompose first, then
    # greedily combine the starter with following marks via COMPOSITION_TABLE
    # (respecting combining-class blocking), finally recompose Hangul.
    def nfc_one(string)
      nfd_string = nfd_one string
      start = nfd_string[0]
      last_class = CLASS_TABLE[start] - 1
      accents = ''
      nfd_string[1..-1].each_char do |accent|
        accent_class = CLASS_TABLE[accent]
        # a mark may compose only if not "blocked" by a preceding mark of
        # equal or higher combining class
        if last_class < accent_class && composite = COMPOSITION_TABLE[start+accent]
          start = composite
        else
          accents += accent
          last_class = accent_class
        end
      end
      hangul_comp_one(start + accents)
    end

    # Returns `string` normalized to `form` (:nfc, :nfd, :nfkc, :nfkd).
    # Non-UTF-8 input is transcoded to UTF-8, normalized, and transcoded
    # back to its original encoding. Raises ArgumentError on unknown forms.
    def normalize(string, form = :nfc)
      encoding = string.encoding
      if encoding == Encoding::UTF_8
        case form
        when :nfc then
          string.gsub(REGEXP_C, NF_HASH_C)
        when :nfd then
          string.gsub(REGEXP_D, NF_HASH_D)
        when :nfkc then
          # compatibility-decompose, then canonically compose
          string.gsub(REGEXP_K, NF_HASH_K).gsub(REGEXP_C, NF_HASH_C)
        when :nfkd then
          string.gsub(REGEXP_K, NF_HASH_K).gsub(REGEXP_D, NF_HASH_D)
        else
          raise ArgumentError, "Invalid normalization form #{form}."
        end
      else
        normalize(string.encode(Encoding::UTF_8), form).encode(encoding)
      end
    end

    # True when `string` is already normalized in `form`. Checked match by
    # match (each suspicious substring compared to its own normalization)
    # without building the whole normalized string.
    def normalized?(string, form = :nfc)
      string = string.encode(Encoding::UTF_8) unless string.encoding == Encoding::UTF_8
      case form
      when :nfc then
        string.scan(REGEXP_C) do |match|
          return false if NF_HASH_C[match] != match
        end
        true
      when :nfd then
        string.scan(REGEXP_D) do |match|
          return false if NF_HASH_D[match] != match
        end
        true
      when :nfkc then
        # NFKC-normalized iff NFC-normalized and free of compatibility chars
        normalized?(string, :nfc) && string !~ REGEXP_K
      when :nfkd then
        normalized?(string, :nfd) && string !~ REGEXP_K
      else
        raise ArgumentError, "Invalid normalization form #{form}."
      end
    end

  end
end
@@ -0,0 +1,198 @@
1
# encoding: utf-8

# Copyright 2010-2013 Ayumu Nojima (野島 歩) and Martin J. Dürst (duerst@it.aoyama.ac.jp)
# available under the same licence as Ruby itself
# (see http://www.ruby-lang.org/en/LICENSE.txt)

# Ruby 1.8 fallback implementation of the Unicode normalizer. Unlike the
# 1.9+ version, the *_one helpers here operate on ARRAYS OF INTEGER
# CODEPOINTS (strings are unpacked with "U*"), and the NF_HASH_* caches
# pack results back into UTF-8 strings for gsub.
class Eprun
  class << self

    ## Constant for max hash capacity to avoid DoS attack
    MAX_HASH_LENGTH = 18000 # enough for all test cases, otherwise tests get slow

    ## Regular Expressions and Hash Constants
    # The REGEXP_*_STRING sources are defined in the generated tables file.
    REGEXP_D = Regexp.compile(REGEXP_D_STRING, Regexp::EXTENDED)
    REGEXP_C = Regexp.compile(REGEXP_C_STRING, Regexp::EXTENDED)
    REGEXP_K = Regexp.compile(REGEXP_K_STRING, Regexp::EXTENDED)

    # Memoizing cache: matched substring -> its NFD form (codepoint array
    # from nfd_one, packed back to a UTF-8 string). One entry is evicted
    # once the cache exceeds MAX_HASH_LENGTH.
    NF_HASH_D = Hash.new do |hash, key|
      hash.delete hash.first[0] if hash.length > MAX_HASH_LENGTH # prevent DoS attack
      hash[key] = Eprun.nfd_one(key).pack("U*")
    end

    # Memoizing cache: matched substring -> its NFC form.
    NF_HASH_C = Hash.new do |hash, key|
      hash.delete hash.first[0] if hash.length > MAX_HASH_LENGTH # prevent DoS attack
      hash[key] = Eprun.nfc_one(key).pack("U*")
    end

    # Memoizing cache: matched substring -> its NFKD form (first pass for
    # both :nfkc and :nfkd in #normalize).
    NF_HASH_K = Hash.new do |hash, key|
      hash.delete hash.first[0] if hash.length > MAX_HASH_LENGTH # prevent DoS attack
      hash[key] = Eprun.nfkd_one(key).pack("U*")
    end

    # Read accessors for the caches.
    def nf_hash_d
      NF_HASH_D
    end

    def nf_hash_c
      NF_HASH_C
    end

    def nf_hash_k
      NF_HASH_K
    end

    ## Constants For Hangul
    # Parameters of the algorithmic Hangul syllable (de)composition
    # (conjoining jamo arithmetic from the Unicode Standard).
    SBASE = 0xAC00
    LBASE = 0x1100
    VBASE = 0x1161
    TBASE = 0x11A7
    LCOUNT = 19
    VCOUNT = 21
    TCOUNT = 28
    NCOUNT = VCOUNT * TCOUNT
    SCOUNT = LCOUNT * NCOUNT

    # Coerces `source` into an array of integer codepoints: arrays pass
    # through unchanged, strings are unpacked as UTF-8 ("U*").
    def get_codepoints(source)
      if source.is_a?(Array)
        source
      elsif source.is_a?(String)
        source.unpack("U*")
      else
        raise ArgumentError, "Source must be a string or an array."
      end
    end

    ## Hangul Algorithm
    # Decomposes a leading precomposed Hangul syllable into its jamo
    # codepoints (L, V, and optionally T); returns `target` unchanged when
    # its first codepoint is not a precomposed syllable.
    def hangul_decomp_one(target)
      cps = get_codepoints(target)
      sIndex = cps.first - SBASE
      return target if sIndex < 0 || sIndex >= SCOUNT
      l = LBASE + sIndex / NCOUNT
      v = VBASE + (sIndex % NCOUNT) / TCOUNT
      t = TBASE + sIndex % TCOUNT
      # t == TBASE means "no trailing consonant": emit only [l, v]
      (t == TBASE ? [l, v] : [l, v, t]) + cps[1..-1]
    end

    # Composes leading conjoining jamo (L+V or L+V+T) back into a single
    # precomposed syllable codepoint; returns `string` unchanged when it
    # does not start with a composable jamo sequence.
    def hangul_comp_one(string)
      cps = get_codepoints(string)
      length = cps.length

      in_range = length > 1 &&
                 0 <= (lead = cps[0] - LBASE) &&
                 lead < LCOUNT &&
                 0 <= (vowel = cps[1] - VBASE) &&
                 vowel < VCOUNT

      if in_range
        lead_vowel = SBASE + (lead * VCOUNT + vowel) * TCOUNT
        if length > 2 && 0 <= (trail = cps[2] - TBASE) && trail < TCOUNT
          [lead_vowel + trail] + cps[3..-1]
        else
          [lead_vowel] + cps[2..-1]
        end
      else
        string
      end
    end

    ## Canonical Ordering
    # Sorts combining marks by canonical combining class (CLASS_TABLE) with
    # a stable bubble sort; starters (class 0) block reordering. Returns a
    # codepoint array.
    def canonical_ordering_one(string)
      cps = get_codepoints(string)
      sorting = cps.collect { |c| [c, CLASS_TABLE[c]] }

      (sorting.length - 2).downto(0) do |i| # bubble sort
        (0..i).each do |j|
          later_class = sorting[j + 1].last
          # swap only when the later codepoint is a combining mark
          # (class > 0) with a strictly smaller class — keeps the sort stable
          if 0 < later_class && later_class < sorting[j].last
            sorting[j], sorting[j + 1] = sorting[j + 1], sorting[j]
          end
        end
      end
      sorting.collect(&:first)
    end

    ## Normalization Forms for Patterns (not whole Strings)
    # Canonical decomposition: per-codepoint DECOMPOSITION_TABLE expansion,
    # then Hangul decomposition and canonical reordering. Returns a
    # codepoint array.
    def nfd_one(string)
      cps = get_codepoints(string)
      cps = cps.inject([]) do |ret, cp|
        if decomposition = DECOMPOSITION_TABLE[cp]
          ret += decomposition
        else
          ret << cp
        end
      end

      canonical_ordering_one(hangul_decomp_one(cps))
    end

    # Compatibility decomposition; recurses into each KOMPATIBLE_TABLE
    # replacement so nested compatibility mappings are fully expanded.
    # Returns a codepoint array.
    def nfkd_one(string)
      cps = get_codepoints(string)
      final_cps = []
      position = 0
      while position < cps.length
        if decomposition = KOMPATIBLE_TABLE[cps[position]]
          final_cps += nfkd_one(decomposition)
        else
          final_cps << cps[position]
        end
        position += 1
      end
      final_cps
    end

    # Canonical composition: decompose first, then greedily combine the
    # starter with following marks via COMPOSITION_TABLE (keyed by the
    # [starter, mark] codepoint pair, respecting combining-class blocking),
    # finally recompose Hangul. Returns a codepoint array.
    def nfc_one(string)
      nfd_cps = nfd_one(string)
      start = nfd_cps[0]
      last_class = CLASS_TABLE[start] - 1
      accents = []
      nfd_cps[1..-1].each do |accent_cp|
        accent_class = CLASS_TABLE[accent_cp]
        # a mark may compose only if not "blocked" by a preceding mark of
        # equal or higher combining class
        if last_class < accent_class && composite = COMPOSITION_TABLE[[start, accent_cp]]
          start = composite
        else
          accents << accent_cp
          last_class = accent_class
        end
      end
      hangul_comp_one([start] + accents)
    end

    # Returns `string` normalized to `form` (:nfc, :nfd, :nfkc, :nfkd).
    # NOTE(review): unlike the 1.9+ version there is no encoding handling
    # here (1.8 strings are byte strings) and :nfkc performs only the K
    # pass followed by the C pass via gsub blocks.
    def normalize(string, form = :nfc)
      case form
      when :nfc then
        string.gsub(REGEXP_C) { |s| NF_HASH_C[s] }
      when :nfd then
        string.gsub(REGEXP_D) { |s| NF_HASH_D[s] }
      when :nfkc then
        string.gsub(REGEXP_K) { |s| NF_HASH_K[s] }.gsub(REGEXP_C) { |s| NF_HASH_C[s] }
      when :nfkd then
        string.gsub(REGEXP_K) { |s| NF_HASH_K[s] }.gsub(REGEXP_D) { |s| NF_HASH_D[s] }
      else
        raise ArgumentError, "Invalid normalization form #{form}."
      end
    end

    # True when `string` is already normalized in `form`. Each suspicious
    # match is compared against its own normalization; the full normalized
    # string is never built.
    def normalized?(string, form = :nfc)
      case form
      when :nfc then
        string.scan REGEXP_C do |match|
          return false if NF_HASH_C[match] != match
        end
        true
      when :nfd then
        string.scan REGEXP_D do |match|
          return false if NF_HASH_D[match] != match
        end
        true
      when :nfkc then
        # NFKC-normalized iff NFC-normalized and free of compatibility chars
        normalized?(string, :nfc) && string !~ REGEXP_K
      when :nfkd then
        normalized?(string, :nfd) && string !~ REGEXP_K
      else
        raise ArgumentError, "Invalid normalization form #{form}."
      end
    end

  end
end # class