kyanite 0.8.0 → 0.8.1
Sign up to get free protection for your applications and to get access to all the features.
- data/History.rdoc +1 -1
- data/lib/kyanite/string/chars.rb +51 -32
- data/lib/kyanite/string/chars_const.rb +7 -7
- data/test/string/test_chars.rb +52 -24
- data/version.rb +1 -1
- metadata +2 -2
data/History.rdoc
CHANGED
data/lib/kyanite/string/chars.rb
CHANGED
@@ -45,56 +45,75 @@ class String
|
|
45
45
|
end
|
46
46
|
|
47
47
|
|
48
|
-
# Reduces
|
48
|
+
# Reduces a rich unicode string to a very limited character set like humans do. Example:
|
49
|
+
# "Céline hören".reduce
|
50
|
+
# => "Celine hoeren"
|
51
|
+
#
|
52
|
+
# Handles all characters from ISO/IEC 8859-1 and CP1252
|
53
|
+
# like humans do, not just deleting the accents.
|
54
|
+
# So it's not a 1:1 translation, some unicode characters are translated to
|
55
|
+
# multiple characters. Example:
|
56
|
+
# "ÄÖÜäöüß".reduce
|
57
|
+
# => "AeOeUeaeoeuess"
|
58
|
+
#
|
59
|
+
# For many unicode characters, this behaviour is based on +UnicodeUtils.nfkd+. Example:
|
49
60
|
# ffi = "\uFB03"
|
50
61
|
# ix = "\u2168"
|
51
62
|
# high23="²³"
|
52
63
|
# high5 = "\u2075"
|
53
64
|
# all = ffi + ix + high23 + high5
|
54
|
-
# all.
|
65
|
+
# all.reduce
|
55
66
|
# => "ffiIX235"
|
56
67
|
#
|
57
|
-
#
|
58
|
-
#
|
59
|
-
# "
|
60
|
-
#
|
68
|
+
# You can preserve some characters, e.g. all special characters of a specific language. Example:
|
69
|
+
# "Céline hören 10€".reduce( :preserve => "ÄÖÜäöüß")
|
70
|
+
# => "Celine hören 10EUR"
|
71
|
+
#
|
72
|
+
# Newlines are preserved by default, but all other non-printable ASCII characters below \\x20 are removed.
|
61
73
|
#
|
62
|
-
#
|
63
|
-
#
|
64
|
-
#
|
65
|
-
#
|
66
|
-
#
|
74
|
+
# There is also a fast mode. It's about 10 times faster, but it supports only 1:1 translation.
|
75
|
+
# "Céline hören 10€".reduce( :preserve => "ÄÖÜäöü߀", :fast => true )
|
76
|
+
# => "Celine hören 10€"
|
77
|
+
#
|
78
|
+
# "ÄÖÜäöü߀".reduce( :fast => true )
|
79
|
+
# => "AOUaous"
|
80
|
+
#
|
81
|
+
# Your result will only contain these characters:
|
82
|
+
# * printable letters, digits and basic symbols of the 7-bit ASCII charset (\\x20..\\x7e)
|
83
|
+
# * preserved characters as defined in the options (max 18)
|
84
|
+
# * newlines (\\x0a and \\x0d)
|
67
85
|
#
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
86
|
+
# Options:
|
87
|
+
# [:preserve] Special characters to preserve. You can only preserve up to 18 characters.
|
88
|
+
# [:fast] Fast mode, if true. About 10 times faster, but it supports only 1:1 translation.
|
89
|
+
#
|
90
|
+
# @return [String]
|
91
|
+
def reduce( options ={} )
|
92
|
+
preserve = options[:preserve] || ''
|
93
|
+
raise ArgumentError, 'max preserve string length is 18 chars' if preserve.length > 18
|
94
|
+
|
95
|
+
result = self.delete("\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0b\x0c\x0e-\x1f")
|
96
|
+
result.tr!(preserve, "\x0e-\x1f") if preserve.length > 0
|
97
|
+
|
98
|
+
result = result.to_ascii_extra_chars unless options[:fast]
|
99
|
+
result.tr!(TR_FULL, TR_REDUCED)
|
100
|
+
result = UnicodeUtils.nfkd(result) unless options[:fast]
|
101
|
+
|
102
|
+
result.delete!("^\x09-\x7e")
|
103
|
+
result.tr!("\x0e-\x1f", preserve) if preserve.length > 0
|
104
|
+
result
|
72
105
|
end
|
73
106
|
|
74
107
|
|
75
108
|
|
76
|
-
#
|
77
|
-
# About 10 times faster than with +UnicodeUtils+.
|
78
|
-
# 1. Converts àáâăäãāåạąæảấầắằÀÁÂĂÄÃĀÅẠĄÆẢẤẦẮẰ etc. to aaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAA.
|
79
|
-
# 2. Then removes all non-Ascii-chars.
|
80
|
-
# 3. Then removes all non-printable Ascii-chars.
|
81
|
-
# 4. Caution: Also Newlines are removed.
|
82
|
-
#
|
83
|
-
# See tests and examples {TestKyaniteStringChars#test_reduce94_a here}.
|
109
|
+
# @deprecated
|
84
110
|
# @return [String]
|
85
111
|
def reduce94( options={} )
|
86
|
-
|
112
|
+
reduce( {:fast => true}.merge(options) )
|
87
113
|
end
|
88
114
|
|
89
115
|
|
90
|
-
|
91
|
-
# @return [String]
|
92
|
-
def reduce94!( options={} )
|
93
|
-
self.gsub!( 'ß', options[:german_sz] ) if options[:german_sz]
|
94
|
-
self.tr!(TR_FULL, TR_REDUCED)
|
95
|
-
self.delete!('^ -~')
|
96
|
-
self
|
97
|
-
end
|
116
|
+
|
98
117
|
|
99
118
|
# Reduziert den String auf ein Base53-Encoding,
|
100
119
|
# bestehend aus Großbuchstaben, Minuszeichen und zu Kleinbuchstaben umgeformten Sonderzeichen.
|
@@ -16,12 +16,12 @@ unless defined?(TR_UPCASE_ALL_REGEXP)
|
|
16
16
|
klammer_auf = "\u227a\u226a\u3008\u276c\u2329\u25c1\u25c0"
|
17
17
|
klammer_zu = "\u227b\u226b\u3009\u276d\u232a\u25b7\u25b6"
|
18
18
|
|
19
|
-
# Sowohl reduce94 als auch
|
19
|
+
# Sowohl reduce94 als auch reduce werden diese Zeichen übersetzen.
|
20
20
|
# Zeichen, die TR_FULL ergänzen und die UnicodeUtils.nfkd nicht korrekt umsetzt.
|
21
21
|
tr_full_b = %q{£₤¢‹¥›•«×»÷‚‘ƒ’ˆ§´¡„¿“¦”†‡µ′″°¤∗·⋅} + leerzeichen + klammer_auf + klammer_zu
|
22
22
|
tr_reduced_b = %q{LLc"Y"*"*"/''f'^P'!"?"|"~~u'"~~***} + (" "*leerzeichen.length) + ("<"*klammer_auf.length) + (">"*klammer_zu.length)
|
23
23
|
|
24
|
-
# Nur
|
24
|
+
# Nur reduce wird diese Zeichen übersetzen.
|
25
25
|
# Zeichen, die in TR_FULL schon drin sind und die UnicodeUtils.nfkd nicht korrekt umsetzt
|
26
26
|
tr_full_c = %q{ØøðđÐĐħĦıĸłŁŧþŦÞаАбБцчЦЧдДеэЕЭфФгГхХийИЙюяЮЯкКлЛмМнНоОпПрРсшщСШЩтТуУвВжзЖЗ}
|
27
27
|
tr_reduced_c = %q{OoddDDhHiklLttTTaAbBccCCdDeeEEfFgGhHiiIIjjJJkKlLmMnNoOpPrRsssSSStTuUvVzzZZ}
|
@@ -29,7 +29,7 @@ unless defined?(TR_UPCASE_ALL_REGEXP)
|
|
29
29
|
|
30
30
|
|
31
31
|
|
32
|
-
# Nur
|
32
|
+
# Nur reduce wird diese Zeichen übersetzen.
|
33
33
|
TR_EXTRA_CHARS = [
|
34
34
|
[/ß/, 'ss'],
|
35
35
|
[/Ö/, 'Oe'],
|
@@ -267,7 +267,7 @@ if $0 == __FILE__ then
|
|
267
267
|
see "Überprüfe TR_EXTRA_CHARS"
|
268
268
|
see "========================"
|
269
269
|
see
|
270
|
-
see "defined in", "Dup if <>0", "Trivial?", "Hex Code", "Character", "reduce94", "
|
270
|
+
see "defined in", "Dup if <>0", "Trivial?", "Hex Code", "Character", "reduce94", "reduce", "Klassifizierung"
|
271
271
|
startline = 14
|
272
272
|
i = 0
|
273
273
|
all = ""
|
@@ -280,7 +280,7 @@ if $0 == __FILE__ then
|
|
280
280
|
c.to_array_of_hex, # sein Code in HEX
|
281
281
|
c, # das Zeichen
|
282
282
|
c.reduce94, # was reduce94 daraus macht
|
283
|
-
c.
|
283
|
+
c.reduce, # was reduce daraus macht
|
284
284
|
UnicodeUtils.char_type(c)
|
285
285
|
|
286
286
|
i+=1
|
@@ -293,7 +293,7 @@ if $0 == __FILE__ then
|
|
293
293
|
see "Überprüfe TR_FULL"
|
294
294
|
see "================="
|
295
295
|
see
|
296
|
-
see "Nr", "Dup if <>0", "Trivial?", "Hex Code", "Character", "reduce94", "
|
296
|
+
see "Nr", "Dup if <>0", "Trivial?", "Hex Code", "Character", "reduce94", "reduce", "Klassifizierung"
|
297
297
|
i = 0
|
298
298
|
all = ""
|
299
299
|
#TR_FULL_TO_ASCII.each_char do |c|
|
@@ -305,7 +305,7 @@ if $0 == __FILE__ then
|
|
305
305
|
c.to_array_of_hex, # sein Code in HEX
|
306
306
|
c, # das Zeichen
|
307
307
|
c.reduce94, # was reduce94 daraus macht
|
308
|
-
c.
|
308
|
+
c.reduce, # was reduce daraus macht
|
309
309
|
UnicodeUtils.char_type(c)
|
310
310
|
|
311
311
|
i+=1
|
data/test/string/test_chars.rb
CHANGED
@@ -44,7 +44,7 @@ class TestKyaniteStringChars < UnitTest
|
|
44
44
|
assert_equal 0, all.to_a.to_set.size-i-1, "TR_FULL: Dup in Zeichen Nr. #{i} Zeichen #{c} >> #{r}"
|
45
45
|
assert c.to_array_of_codepoints[0] > 127, "TR_FULL: Trivialität in Zeichen Nr. #{i} Zeichen #{c} >> #{r}"
|
46
46
|
assert r.to_array_of_codepoints[0] <= 127, "TR_FULL: Zeichen Nr. #{i} Zeichen #{c} >> #{r} wird nicht in ASCII umgesetzt"
|
47
|
-
assert_equal c.reduce94, c.
|
47
|
+
assert_equal c.reduce94, c.reduce[0]
|
48
48
|
i+=1
|
49
49
|
end
|
50
50
|
end
|
@@ -87,7 +87,7 @@ class TestKyaniteStringChars < UnitTest
|
|
87
87
|
full = 'ªàáâăãāåạąảấầắằÀÁÂĂÃĀÅẠĄẢẤẦẮẰ'
|
88
88
|
reduced = 'aaaaaaaaaaaaaaaAAAAAAAAAAAAAA'
|
89
89
|
assert_equal reduced, full.reduce94
|
90
|
-
assert_equal reduced, full.
|
90
|
+
assert_equal reduced, full.reduce
|
91
91
|
end
|
92
92
|
|
93
93
|
def test_to_ascii_b
|
@@ -95,7 +95,7 @@ class TestKyaniteStringChars < UnitTest
|
|
95
95
|
reduced1 = 'cccccCCCCCdDeeeeeeeeeeEEEEEEEEEE'
|
96
96
|
reduced2 = 'ccccchCCCCChdDeeeeeeeeeeEEEEEEEEEE'
|
97
97
|
assert_equal reduced1, full.reduce94
|
98
|
-
assert_equal reduced2, full.
|
98
|
+
assert_equal reduced2, full.reduce
|
99
99
|
end
|
100
100
|
|
101
101
|
def test_to_ascii_c
|
@@ -103,14 +103,14 @@ class TestKyaniteStringChars < UnitTest
|
|
103
103
|
reduced1 = 'ggggGGGGhHiiiiiiiiIIIIIIIIIjJkKllllLLLL'
|
104
104
|
reduced2 = 'ggghgGGGhGhHiiiiiiiiIIIIIIIIIjJkKllllLLLL'
|
105
105
|
assert_equal reduced1, full.reduce94
|
106
|
-
assert_equal reduced2, full.
|
106
|
+
assert_equal reduced2, full.reduce
|
107
107
|
end
|
108
108
|
|
109
109
|
def test_to_ascii_e
|
110
110
|
full = 'ńňñņʼnŃŇÑŅòóôŏõōőơÒÓÔŎÕŌŐƠ'
|
111
111
|
reduced = 'nnnnnNNNNooooooooOOOOOOOO'
|
112
112
|
assert_equal reduced, full.reduce94
|
113
|
-
assert_equal reduced, full.
|
113
|
+
assert_equal reduced, full.reduce
|
114
114
|
end
|
115
115
|
|
116
116
|
def test_to_ascii_f
|
@@ -118,24 +118,24 @@ class TestKyaniteStringChars < UnitTest
|
|
118
118
|
reduced1 = 'rrrRRRssssSSSSttTTuuuuuuuuuuUUUUUUUUUUwWyyyYYYzzzZZZ'
|
119
119
|
reduced2 = 'rrrRRRssshsSSShSttTTuuuuuuuuuuUUUUUUUUUUwWyyyYYYzzzhZZZh'
|
120
120
|
assert_equal reduced1, full.reduce94
|
121
|
-
assert_equal reduced2, full.
|
121
|
+
assert_equal reduced2, full.reduce
|
122
122
|
end
|
123
123
|
|
124
124
|
def test_to_ascii_zusammengesetzt
|
125
125
|
full = 'ijIJſ…'
|
126
126
|
reduced = 'ijIJs...'
|
127
|
-
assert_equal reduced, full.
|
127
|
+
assert_equal reduced, full.reduce
|
128
128
|
end
|
129
129
|
|
130
130
|
def test_to_ascii_same_same
|
131
131
|
same_same = '^!"$%&/()=?@*+~#<>|,;:.-_ {[]}\\'
|
132
|
-
assert_equal same_same, same_same.
|
132
|
+
assert_equal same_same, same_same.reduce
|
133
133
|
same_same = "'0123456789"
|
134
|
-
assert_equal same_same, same_same.
|
134
|
+
assert_equal same_same, same_same.reduce
|
135
135
|
same_same = 'abcdefghijklmnopqrstuvwxyz'
|
136
|
-
assert_equal same_same, same_same.
|
136
|
+
assert_equal same_same, same_same.reduce
|
137
137
|
same_same = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
|
138
|
-
assert_equal same_same, same_same.
|
138
|
+
assert_equal same_same, same_same.reduce
|
139
139
|
end
|
140
140
|
|
141
141
|
|
@@ -143,7 +143,7 @@ class TestKyaniteStringChars < UnitTest
|
|
143
143
|
full = '¯¨'
|
144
144
|
reduced = ' ' * full.length
|
145
145
|
assert_equal 2, full.length
|
146
|
-
assert_equal reduced, full.
|
146
|
+
assert_equal reduced, full.reduce
|
147
147
|
end
|
148
148
|
|
149
149
|
def test_to_ascii_s
|
@@ -155,29 +155,63 @@ class TestKyaniteStringChars < UnitTest
|
|
155
155
|
reduced1 = "sOUAoua"
|
156
156
|
reduced2 = "ffiIX235EURssOeUeAeoeueae"
|
157
157
|
assert_equal reduced1, full.reduce94
|
158
|
-
assert_equal reduced2, full.
|
158
|
+
assert_equal reduced2, full.reduce
|
159
159
|
end
|
160
160
|
|
161
161
|
def test_LANG_SPECIAL_CHARS
|
162
162
|
LANG_SPECIAL_CHARS .each do | lang, (full, reduced) |
|
163
|
-
#see lang, full, reduced, full.
|
164
|
-
assert_equal reduced, full.
|
163
|
+
#see lang, full, reduced, full.reduce, full.reduce94
|
164
|
+
assert_equal reduced, full.reduce
|
165
165
|
end
|
166
166
|
end
|
167
167
|
|
168
168
|
def test_spaces
|
169
169
|
spaces = "\u0020\u00a0\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u202f\u205f\u3000\u2420\u2423"
|
170
|
-
assert_equal spaces.
|
170
|
+
assert_equal spaces.reduce, " " * spaces.length
|
171
171
|
assert_equal spaces.reduce94, " " * spaces.length
|
172
172
|
end
|
173
173
|
|
174
174
|
|
175
175
|
def test_minus_signs
|
176
176
|
minus = "\u00ac\u2212\u2010\u2011\u2012\u2013\u2014\u2015\u2500"
|
177
|
-
assert_equal minus.
|
177
|
+
assert_equal minus.reduce, "-" * minus.length
|
178
178
|
#assert_equal spaces.reduce94, " " * spaces.length
|
179
179
|
end
|
180
180
|
|
181
|
+
|
182
|
+
def test_preserve
|
183
|
+
# 0123456789012345678901234567890123456789
|
184
|
+
test = "ßàáâăäãāāāåạąæảấầắằÀÁÂĂÄÃĀÅẠĄÆẢẤẦẮẰćĉčçċĆĈČÇĊďðđĎÐĐèéêěĕëēėęếÈÉÊĚĔËĒĖĘẾĝğġģĜĞĠĢĥħĤĦìíîĭïĩīıįijÌÍÎĬÏĨĪİĮIJĵĴķĶĺľłļŀĹĽŁĻĿńňñņʼnŋŃŇÑŅŊòóôŏöõōøőơœÒÓÔŎÖÕŌØŐƠŒŕřŗŔŘŖśŝšßşŚŜŠŞţťŧþŢŤŦÞùúûŭüũūůűųưÙÚÛŬÜŨŪŮŰŲƯŵŴýŷÿÝŶŸźżžŹŻŽ"
|
185
|
+
belassen = test[10..27]
|
186
|
+
exp = "ssaaaaaeaaaaåạąæảấầắằÀÁÂĂÄÃĀÅẠAAEAAAAAccccchCCCCChdddDDDeeeeeeeeeeEEEEEEEEEEggghgGGGhGhhHHiiiiiiiiiijIIIIIIIIIIJjJkKlllllLLLLLnnnnnnjNNNNNJoooooeooooooeOOOOOeOOOOOOErrrRRRssshsssSSShSttttTTTTuuuuueuuuuuuUUUUUeUUUUUUwWyyyYYYzzzhZZZh"
|
187
|
+
assert_equal exp, test.reduce(:preserve => belassen)
|
188
|
+
assert_raise ArgumentError do
|
189
|
+
belassen = test[10..28]
|
190
|
+
test.reduce(:preserve => belassen)
|
191
|
+
end
|
192
|
+
test = "Háâaäãaållo\nWelt"
|
193
|
+
assert_equal "Haaaäaaallo\nWelt", test.reduce( :preserve =>"äöüßÄÖÜ" )
|
194
|
+
|
195
|
+
end
|
196
|
+
|
197
|
+
|
198
|
+
|
199
|
+
def test_examples
|
200
|
+
assert_equal "Celine hoeren", "Céline hören".reduce
|
201
|
+
assert_equal "AeOeUeaeoeuess", "ÄÖÜäöüß".reduce
|
202
|
+
assert_equal "Celine hören 10EUR", "Céline hören 10€".reduce( :preserve => "ÄÖÜäöüß")
|
203
|
+
assert_equal "Celine hören 10€", "Céline hören 10€".reduce( :preserve => "ÄÖÜäöü߀", :fast => true)
|
204
|
+
assert_equal "AOUaous", "ÄÖÜäöü߀".reduce( :fast => true )
|
205
|
+
end
|
206
|
+
|
207
|
+
|
208
|
+
def test_newlines_and_nonprintables
|
209
|
+
test = "Céli\x00ne\nhöre\x0c\x0e\x0fn"
|
210
|
+
assert_equal "Celine\nhören", test.reduce( :preserve => "ÄÖÜäöüß")
|
211
|
+
assert_equal "Celine\nhoeren", test.reduce
|
212
|
+
assert_equal "Celine\nhoren", test.reduce(:fast => true )
|
213
|
+
end
|
214
|
+
|
181
215
|
|
182
216
|
|
183
217
|
|
@@ -247,13 +281,7 @@ ENDOFSTRING
|
|
247
281
|
assert_equal 'SCHEIZE', 'Scheiße'.reduce53(:german_sz => 'z')
|
248
282
|
assert_equal 'SCHEIZE', 'Scheiße'.reduce53(:german_sz => 'Z')
|
249
283
|
assert_equal 'SCHEISSE', 'Scheiße'.reduce53(:german_sz => 'SS')
|
250
|
-
|
251
|
-
# geht vielleicht in Ruby 1.9
|
252
|
-
assert_equal 'Scheize', 'Scheiße'.reduce94(:german_sz => 'z')
|
253
|
-
assert_equal 'ScheiZe', 'Scheiße'.reduce94(:german_sz => 'Z')
|
254
|
-
assert_equal 'Scheisse', 'Scheiße'.reduce94(:german_sz => 'ss')
|
255
|
-
assert_equal 'Schei$e', 'Scheiße'.reduce94(:german_sz => '$')
|
256
|
-
assert_equal 'Schei$e', 'Schei$e'.reduce94
|
284
|
+
assert_equal 'Scheiß Arsche', 'Scheiß Ärsche'.reduce94(:preserve => 'ß')
|
257
285
|
end
|
258
286
|
|
259
287
|
|
data/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: kyanite
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.
|
4
|
+
version: 0.8.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-11-
|
12
|
+
date: 2012-11-18 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: drumherum
|