kyanite 0.8.0 → 0.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.rdoc +1 -1
- data/lib/kyanite/string/chars.rb +51 -32
- data/lib/kyanite/string/chars_const.rb +7 -7
- data/test/string/test_chars.rb +52 -24
- data/version.rb +1 -1
- metadata +2 -2
data/History.rdoc
CHANGED
data/lib/kyanite/string/chars.rb
CHANGED
@@ -45,56 +45,75 @@ class String
|
|
45
45
|
end
|
46
46
|
|
47
47
|
|
48
|
-
# Reduces
|
48
|
+
# Reduces a rich unicode string to a very limited character set like humans do. Example:
|
49
|
+
# "Céline hören".reduce
|
50
|
+
# => "Celine hoeren"
|
51
|
+
#
|
52
|
+
# Handles all characters from ISO/IEC 8859-1 and CP1252
|
53
|
+
# like humans do, not just deleting the accents.
|
54
|
+
# So it's not a 1:1 translation, some unicode characters are translated to
|
55
|
+
# multiple characters. Example:
|
56
|
+
# "ÄÖÜäöüß".reduce
|
57
|
+
# => "AeOeUeaeoeuess"
|
58
|
+
#
|
59
|
+
# For many unicode characters, this behaviour is based on +UnicodeUtils.nfkd+. Example:
|
49
60
|
# ffi = "\uFB03"
|
50
61
|
# ix = "\u2168"
|
51
62
|
# high23="²³"
|
52
63
|
# high5 = "\u2075"
|
53
64
|
# all = ffi + ix + high23 + high5
|
54
|
-
# all.
|
65
|
+
# all.reduce
|
55
66
|
# => "ffiIX235"
|
56
67
|
#
|
57
|
-
#
|
58
|
-
#
|
59
|
-
# "
|
60
|
-
#
|
68
|
+
# You can preserve some characters, e.g. all special characters of a specific language. Example:
|
69
|
+
# "Céline hören 10€".reduce( :preserve => "ÄÖÜäöüß")
|
70
|
+
# => "Celine hören 10EUR"
|
71
|
+
#
|
72
|
+
# Newlines are preserved by default, but all other nonprintable ascii characters below \\x20 are removed.
|
61
73
|
#
|
62
|
-
#
|
63
|
-
#
|
64
|
-
#
|
65
|
-
#
|
66
|
-
#
|
74
|
+
# There is also a fast mode. It's about 10 times faster, but it supports only 1:1 translation.
|
75
|
+
# "Céline hören 10€".reduce( :preserve => "ÄÖÜäöü߀", :fast => true )
|
76
|
+
# => "Celine hören 10€"
|
77
|
+
#
|
78
|
+
# "ÄÖÜäöü߀".reduce( :fast => true )
|
79
|
+
# => "AOUaous"
|
80
|
+
#
|
81
|
+
# Your result will only contain these characters:
|
82
|
+
# * printable letters and basic symbols of the 7bit ASCII charset (\\x20..\\x7e)
|
83
|
+
# * preserved characters as defined in the options (max 18)
|
84
|
+
# * newlines (\\x0a and \\x0d)
|
67
85
|
#
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
86
|
+
# Options:
|
87
|
+
# [:preserve] Special characters to preserve. You can only preserve up to 18 characters.
|
88
|
+
# [:fast] Fast mode, if true. About 10 times faster, but it supports only 1:1 translation.
|
89
|
+
#
|
90
|
+
# @return [String]
|
91
|
+
def reduce( options ={} )
|
92
|
+
preserve = options[:preserve] || ''
|
93
|
+
raise ArgumentError, 'max preserve string length is 18 chars' if preserve.length > 18
|
94
|
+
|
95
|
+
result = self.delete("\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0b\x0c\x0e-\x1f")
|
96
|
+
result.tr!(preserve, "\x0e-\x1f") if preserve.length > 0
|
97
|
+
|
98
|
+
result = result.to_ascii_extra_chars unless options[:fast]
|
99
|
+
result.tr!(TR_FULL, TR_REDUCED)
|
100
|
+
result = UnicodeUtils.nfkd(result) unless options[:fast]
|
101
|
+
|
102
|
+
result.delete!("^\x09-\x7e")
|
103
|
+
result.tr!("\x0e-\x1f", preserve) if preserve.length > 0
|
104
|
+
result
|
72
105
|
end
|
73
106
|
|
74
107
|
|
75
108
|
|
76
|
-
#
|
77
|
-
# About 10 times faster than with +UnicodeUtils+.
|
78
|
-
# 1. Converts àáâăäãāåạąæảấầắằÀÁÂĂÄÃĀÅẠĄÆẢẤẦẮẰ etc. to aaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAA.
|
79
|
-
# 2. Then removes all non-Ascii-chars.
|
80
|
-
# 3. Then removes all non-printable Ascii-chars.
|
81
|
-
# 4. Caution: Also Newlines are removed.
|
82
|
-
#
|
83
|
-
# See tests and examples {TestKyaniteStringChars#test_reduce94_a here}.
|
109
|
+
# @deprecated
|
84
110
|
# @return [String]
|
85
111
|
def reduce94( options={} )
|
86
|
-
|
112
|
+
reduce( {:fast => true}.merge(options) )
|
87
113
|
end
|
88
114
|
|
89
115
|
|
90
|
-
|
91
|
-
# @return [String]
|
92
|
-
def reduce94!( options={} )
|
93
|
-
self.gsub!( 'ß', options[:german_sz] ) if options[:german_sz]
|
94
|
-
self.tr!(TR_FULL, TR_REDUCED)
|
95
|
-
self.delete!('^ -~')
|
96
|
-
self
|
97
|
-
end
|
116
|
+
|
98
117
|
|
99
118
|
# Reduziert den String auf ein Base53-Encoding,
|
100
119
|
# bestehend aus Großbuchstaben, Minuszeichen und zu Kleinbuchstaben umgeformten Sonderzeichen.
|
@@ -16,12 +16,12 @@ unless defined?(TR_UPCASE_ALL_REGEXP)
|
|
16
16
|
klammer_auf = "\u227a\u226a\u3008\u276c\u2329\u25c1\u25c0"
|
17
17
|
klammer_zu = "\u227b\u226b\u3009\u276d\u232a\u25b7\u25b6"
|
18
18
|
|
19
|
-
# Sowohl reduce94 als auch
|
19
|
+
# Sowohl reduce94 als auch reduce werden diese Zeichen übersetzen.
|
20
20
|
# Zeichen, die TR_FULL ergänzen und die UnicodeUtils.nfkd nicht korrekt umsetzt.
|
21
21
|
tr_full_b = %q{£₤¢‹¥›•«×»÷‚‘ƒ’ˆ§´¡„¿“¦”†‡µ′″°¤∗·⋅} + leerzeichen + klammer_auf + klammer_zu
|
22
22
|
tr_reduced_b = %q{LLc"Y"*"*"/''f'^P'!"?"|"~~u'"~~***} + (" "*leerzeichen.length) + ("<"*klammer_auf.length) + (">"*klammer_zu.length)
|
23
23
|
|
24
|
-
# Nur
|
24
|
+
# Nur reduce wird diese Zeichen übersetzen.
|
25
25
|
# Zeichen, die in TR_FULL schon drin sind und die UnicodeUtils.nfkd nicht korrekt umsetzt
|
26
26
|
tr_full_c = %q{ØøðđÐĐħĦıĸłŁŧþŦÞаАбБцчЦЧдДеэЕЭфФгГхХийИЙюяЮЯкКлЛмМнНоОпПрРсшщСШЩтТуУвВжзЖЗ}
|
27
27
|
tr_reduced_c = %q{OoddDDhHiklLttTTaAbBccCCdDeeEEfFgGhHiiIIjjJJkKlLmMnNoOpPrRsssSSStTuUvVzzZZ}
|
@@ -29,7 +29,7 @@ unless defined?(TR_UPCASE_ALL_REGEXP)
|
|
29
29
|
|
30
30
|
|
31
31
|
|
32
|
-
# Nur
|
32
|
+
# Nur reduce wird diese Zeichen übersetzen.
|
33
33
|
TR_EXTRA_CHARS = [
|
34
34
|
[/ß/, 'ss'],
|
35
35
|
[/Ö/, 'Oe'],
|
@@ -267,7 +267,7 @@ if $0 == __FILE__ then
|
|
267
267
|
see "Überprüfe TR_EXTRA_CHARS"
|
268
268
|
see "========================"
|
269
269
|
see
|
270
|
-
see "defined in", "Dup if <>0", "Trivial?", "Hex Code", "Character", "reduce94", "
|
270
|
+
see "defined in", "Dup if <>0", "Trivial?", "Hex Code", "Character", "reduce94", "reduce", "Klassifizierung"
|
271
271
|
startline = 14
|
272
272
|
i = 0
|
273
273
|
all = ""
|
@@ -280,7 +280,7 @@ if $0 == __FILE__ then
|
|
280
280
|
c.to_array_of_hex, # sein Code in HEX
|
281
281
|
c, # das Zeichen
|
282
282
|
c.reduce94, # was reduce94 daraus macht
|
283
|
-
c.
|
283
|
+
c.reduce, # was reduce daraus macht
|
284
284
|
UnicodeUtils.char_type(c)
|
285
285
|
|
286
286
|
i+=1
|
@@ -293,7 +293,7 @@ if $0 == __FILE__ then
|
|
293
293
|
see "Überprüfe TR_FULL"
|
294
294
|
see "================="
|
295
295
|
see
|
296
|
-
see "Nr", "Dup if <>0", "Trivial?", "Hex Code", "Character", "reduce94", "
|
296
|
+
see "Nr", "Dup if <>0", "Trivial?", "Hex Code", "Character", "reduce94", "reduce", "Klassifizierung"
|
297
297
|
i = 0
|
298
298
|
all = ""
|
299
299
|
#TR_FULL_TO_ASCII.each_char do |c|
|
@@ -305,7 +305,7 @@ if $0 == __FILE__ then
|
|
305
305
|
c.to_array_of_hex, # sein Code in HEX
|
306
306
|
c, # das Zeichen
|
307
307
|
c.reduce94, # was reduce94 daraus macht
|
308
|
-
c.
|
308
|
+
c.reduce, # was reduce daraus macht
|
309
309
|
UnicodeUtils.char_type(c)
|
310
310
|
|
311
311
|
i+=1
|
data/test/string/test_chars.rb
CHANGED
@@ -44,7 +44,7 @@ class TestKyaniteStringChars < UnitTest
|
|
44
44
|
assert_equal 0, all.to_a.to_set.size-i-1, "TR_FULL: Dup in Zeichen Nr. #{i} Zeichen #{c} >> #{r}"
|
45
45
|
assert c.to_array_of_codepoints[0] > 127, "TR_FULL: Trivialität in Zeichen Nr. #{i} Zeichen #{c} >> #{r}"
|
46
46
|
assert r.to_array_of_codepoints[0] <= 127, "TR_FULL: Zeichen Nr. #{i} Zeichen #{c} >> #{r} wird nicht in ASCII umgesetzt"
|
47
|
-
assert_equal c.reduce94, c.
|
47
|
+
assert_equal c.reduce94, c.reduce[0]
|
48
48
|
i+=1
|
49
49
|
end
|
50
50
|
end
|
@@ -87,7 +87,7 @@ class TestKyaniteStringChars < UnitTest
|
|
87
87
|
full = 'ªàáâăãāåạąảấầắằÀÁÂĂÃĀÅẠĄẢẤẦẮẰ'
|
88
88
|
reduced = 'aaaaaaaaaaaaaaaAAAAAAAAAAAAAA'
|
89
89
|
assert_equal reduced, full.reduce94
|
90
|
-
assert_equal reduced, full.
|
90
|
+
assert_equal reduced, full.reduce
|
91
91
|
end
|
92
92
|
|
93
93
|
def test_to_ascii_b
|
@@ -95,7 +95,7 @@ class TestKyaniteStringChars < UnitTest
|
|
95
95
|
reduced1 = 'cccccCCCCCdDeeeeeeeeeeEEEEEEEEEE'
|
96
96
|
reduced2 = 'ccccchCCCCChdDeeeeeeeeeeEEEEEEEEEE'
|
97
97
|
assert_equal reduced1, full.reduce94
|
98
|
-
assert_equal reduced2, full.
|
98
|
+
assert_equal reduced2, full.reduce
|
99
99
|
end
|
100
100
|
|
101
101
|
def test_to_ascii_c
|
@@ -103,14 +103,14 @@ class TestKyaniteStringChars < UnitTest
|
|
103
103
|
reduced1 = 'ggggGGGGhHiiiiiiiiIIIIIIIIIjJkKllllLLLL'
|
104
104
|
reduced2 = 'ggghgGGGhGhHiiiiiiiiIIIIIIIIIjJkKllllLLLL'
|
105
105
|
assert_equal reduced1, full.reduce94
|
106
|
-
assert_equal reduced2, full.
|
106
|
+
assert_equal reduced2, full.reduce
|
107
107
|
end
|
108
108
|
|
109
109
|
def test_to_ascii_e
|
110
110
|
full = 'ńňñņʼnŃŇÑŅòóôŏõōőơÒÓÔŎÕŌŐƠ'
|
111
111
|
reduced = 'nnnnnNNNNooooooooOOOOOOOO'
|
112
112
|
assert_equal reduced, full.reduce94
|
113
|
-
assert_equal reduced, full.
|
113
|
+
assert_equal reduced, full.reduce
|
114
114
|
end
|
115
115
|
|
116
116
|
def test_to_ascii_f
|
@@ -118,24 +118,24 @@ class TestKyaniteStringChars < UnitTest
|
|
118
118
|
reduced1 = 'rrrRRRssssSSSSttTTuuuuuuuuuuUUUUUUUUUUwWyyyYYYzzzZZZ'
|
119
119
|
reduced2 = 'rrrRRRssshsSSShSttTTuuuuuuuuuuUUUUUUUUUUwWyyyYYYzzzhZZZh'
|
120
120
|
assert_equal reduced1, full.reduce94
|
121
|
-
assert_equal reduced2, full.
|
121
|
+
assert_equal reduced2, full.reduce
|
122
122
|
end
|
123
123
|
|
124
124
|
def test_to_ascii_zusammengesetzt
|
125
125
|
full = 'ijIJſ…'
|
126
126
|
reduced = 'ijIJs...'
|
127
|
-
assert_equal reduced, full.
|
127
|
+
assert_equal reduced, full.reduce
|
128
128
|
end
|
129
129
|
|
130
130
|
def test_to_ascii_same_same
|
131
131
|
same_same = '^!"$%&/()=?@*+~#<>|,;:.-_ {[]}\\'
|
132
|
-
assert_equal same_same, same_same.
|
132
|
+
assert_equal same_same, same_same.reduce
|
133
133
|
same_same = "'0123456789"
|
134
|
-
assert_equal same_same, same_same.
|
134
|
+
assert_equal same_same, same_same.reduce
|
135
135
|
same_same = 'abcdefghijklmnopqrstuvwxyz'
|
136
|
-
assert_equal same_same, same_same.
|
136
|
+
assert_equal same_same, same_same.reduce
|
137
137
|
same_same = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
|
138
|
-
assert_equal same_same, same_same.
|
138
|
+
assert_equal same_same, same_same.reduce
|
139
139
|
end
|
140
140
|
|
141
141
|
|
@@ -143,7 +143,7 @@ class TestKyaniteStringChars < UnitTest
|
|
143
143
|
full = '¯¨'
|
144
144
|
reduced = ' ' * full.length
|
145
145
|
assert_equal 2, full.length
|
146
|
-
assert_equal reduced, full.
|
146
|
+
assert_equal reduced, full.reduce
|
147
147
|
end
|
148
148
|
|
149
149
|
def test_to_ascii_s
|
@@ -155,29 +155,63 @@ class TestKyaniteStringChars < UnitTest
|
|
155
155
|
reduced1 = "sOUAoua"
|
156
156
|
reduced2 = "ffiIX235EURssOeUeAeoeueae"
|
157
157
|
assert_equal reduced1, full.reduce94
|
158
|
-
assert_equal reduced2, full.
|
158
|
+
assert_equal reduced2, full.reduce
|
159
159
|
end
|
160
160
|
|
161
161
|
def test_LANG_SPECIAL_CHARS
|
162
162
|
LANG_SPECIAL_CHARS .each do | lang, (full, reduced) |
|
163
|
-
#see lang, full, reduced, full.
|
164
|
-
assert_equal reduced, full.
|
163
|
+
#see lang, full, reduced, full.reduce, full.reduce94
|
164
|
+
assert_equal reduced, full.reduce
|
165
165
|
end
|
166
166
|
end
|
167
167
|
|
168
168
|
def test_spaces
|
169
169
|
spaces = "\u0020\u00a0\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u202f\u205f\u3000\u2420\u2423"
|
170
|
-
assert_equal spaces.
|
170
|
+
assert_equal spaces.reduce, " " * spaces.length
|
171
171
|
assert_equal spaces.reduce94, " " * spaces.length
|
172
172
|
end
|
173
173
|
|
174
174
|
|
175
175
|
def test_minus_signs
|
176
176
|
minus = "\u00ac\u2212\u2010\u2011\u2012\u2013\u2014\u2015\u2500"
|
177
|
-
assert_equal minus.
|
177
|
+
assert_equal minus.reduce, "-" * minus.length
|
178
178
|
#assert_equal spaces.reduce94, " " * spaces.length
|
179
179
|
end
|
180
180
|
|
181
|
+
|
182
|
+
def test_preserve
|
183
|
+
# 0123456789012345678901234567890123456789
|
184
|
+
test = "ßàáâăäãāāāåạąæảấầắằÀÁÂĂÄÃĀÅẠĄÆẢẤẦẮẰćĉčçċĆĈČÇĊďðđĎÐĐèéêěĕëēėęếÈÉÊĚĔËĒĖĘẾĝğġģĜĞĠĢĥħĤĦìíîĭïĩīıįijÌÍÎĬÏĨĪİĮIJĵĴķĶĺľłļŀĹĽŁĻĿńňñņʼnŋŃŇÑŅŊòóôŏöõōøőơœÒÓÔŎÖÕŌØŐƠŒŕřŗŔŘŖśŝšßşŚŜŠŞţťŧþŢŤŦÞùúûŭüũūůűųưÙÚÛŬÜŨŪŮŰŲƯŵŴýŷÿÝŶŸźżžŹŻŽ"
|
185
|
+
belassen = test[10..27]
|
186
|
+
exp = "ssaaaaaeaaaaåạąæảấầắằÀÁÂĂÄÃĀÅẠAAEAAAAAccccchCCCCChdddDDDeeeeeeeeeeEEEEEEEEEEggghgGGGhGhhHHiiiiiiiiiijIIIIIIIIIIJjJkKlllllLLLLLnnnnnnjNNNNNJoooooeooooooeOOOOOeOOOOOOErrrRRRssshsssSSShSttttTTTTuuuuueuuuuuuUUUUUeUUUUUUwWyyyYYYzzzhZZZh"
|
187
|
+
assert_equal exp, test.reduce(:preserve => belassen)
|
188
|
+
assert_raise ArgumentError do
|
189
|
+
belassen = test[10..28]
|
190
|
+
test.reduce(:preserve => belassen)
|
191
|
+
end
|
192
|
+
test = "Háâaäãaållo\nWelt"
|
193
|
+
assert_equal "Haaaäaaallo\nWelt", test.reduce( :preserve =>"äöüßÄÖÜ" )
|
194
|
+
|
195
|
+
end
|
196
|
+
|
197
|
+
|
198
|
+
|
199
|
+
def test_examples
|
200
|
+
assert_equal "Celine hoeren", "Céline hören".reduce
|
201
|
+
assert_equal "AeOeUeaeoeuess", "ÄÖÜäöüß".reduce
|
202
|
+
assert_equal "Celine hören 10EUR", "Céline hören 10€".reduce( :preserve => "ÄÖÜäöüß")
|
203
|
+
assert_equal "Celine hören 10€", "Céline hören 10€".reduce( :preserve => "ÄÖÜäöü߀", :fast => true)
|
204
|
+
assert_equal "AOUaous", "ÄÖÜäöü߀".reduce( :fast => true )
|
205
|
+
end
|
206
|
+
|
207
|
+
|
208
|
+
def test_newlines_and_nonprintables
|
209
|
+
test = "Céli\x00ne\nhöre\x0c\x0e\x0fn"
|
210
|
+
assert_equal "Celine\nhören", test.reduce( :preserve => "ÄÖÜäöüß")
|
211
|
+
assert_equal "Celine\nhoeren", test.reduce
|
212
|
+
assert_equal "Celine\nhoren", test.reduce(:fast => true )
|
213
|
+
end
|
214
|
+
|
181
215
|
|
182
216
|
|
183
217
|
|
@@ -247,13 +281,7 @@ ENDOFSTRING
|
|
247
281
|
assert_equal 'SCHEIZE', 'Scheiße'.reduce53(:german_sz => 'z')
|
248
282
|
assert_equal 'SCHEIZE', 'Scheiße'.reduce53(:german_sz => 'Z')
|
249
283
|
assert_equal 'SCHEISSE', 'Scheiße'.reduce53(:german_sz => 'SS')
|
250
|
-
|
251
|
-
# geht vielleicht in Ruby 1.9
|
252
|
-
assert_equal 'Scheize', 'Scheiße'.reduce94(:german_sz => 'z')
|
253
|
-
assert_equal 'ScheiZe', 'Scheiße'.reduce94(:german_sz => 'Z')
|
254
|
-
assert_equal 'Scheisse', 'Scheiße'.reduce94(:german_sz => 'ss')
|
255
|
-
assert_equal 'Schei$e', 'Scheiße'.reduce94(:german_sz => '$')
|
256
|
-
assert_equal 'Schei$e', 'Schei$e'.reduce94
|
284
|
+
assert_equal 'Scheiß Arsche', 'Scheiß Ärsche'.reduce94(:preserve => 'ß')
|
257
285
|
end
|
258
286
|
|
259
287
|
|
data/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: kyanite
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.0
|
4
|
+
version: 0.8.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-11-
|
12
|
+
date: 2012-11-18 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: drumherum
|