kyanite 0.8.0 → 0.8.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,5 +1,5 @@
1
1
  == 0.8.0 2012-11-17
2
- * added String#to_ascii with human-like handling of unicode special characters
2
+ * added String#reduce with human-like handling of unicode special characters
3
3
 
4
4
  == 0.7.5 2012-11-14
5
5
  * added FSymbol class
@@ -45,56 +45,75 @@ class String
45
45
  end
46
46
 
47
47
 
48
- # Reduces the string to a ASCII encoding. Example:
48
+ # Reduces a rich unicode string to a very limited character set like humans do. Example:
49
+ # "Céline hören".reduce
50
+ # => "Celine hoeren"
51
+ #
52
+ # Handles all characters from ISO/IEC 8859-1 and CP1252
53
+ # like humans do, not just deleting the accents.
54
+ # So it's not a 1:1 translation, some unicode characters are translated to
55
+ # multiple characters. Example:
56
+ # "ÄÖÜäöüß".reduce
57
+ # => "AeOeUeaeoeuess"
58
+ #
59
+ # For many unicode characters, this behaviour is based on +UnicodeUtils.nfkd+. Example:
49
60
  # ffi = "\uFB03"
50
61
  # ix = "\u2168"
51
62
  # high23="²³"
52
63
  # high5 = "\u2075"
53
64
  # all = ffi + ix + high23 + high5
54
- # all.to_ascii
65
+ # all.reduce
55
66
  # => "ffiIX235"
56
67
  #
57
- # Based on +UnicodeUtils.nfkd+, but handles all characters from ISO/IEC 8859-1 and CP1252
58
- # like humans do, not just deleting the accents. Example:
59
- # "ÄÖÜäöüß".to_ascii
60
- # => "AeOeUeaeoeuess"
68
+ # You can preserve some characters, e.g. all special characters of a specific language. Example:
69
+ # "Céline hören 10€".reduce( :preserve => "ÄÖÜäöüß")
70
+ # => "Celine hören 10EUR"
71
+ #
72
+ # Newlines are preserved by default, but all other nonprintable ASCII characters below \\x20 are removed.
61
73
  #
62
- # 1. Converts ÄÖÜäöüßàáâăäãāåạąæảấầắằ etc. to AeOeUeaeoeuessaaaaaaaaaaaaaaaa.
63
- # 2. Then removes all non-Ascii-chars.
64
- # 3. Then removes all non-printable Ascii-chars.
65
- # 4. Caution: Also Newlines are removed.
66
- # About 10 times slower than {#reduce94 reduce94}, but more accurate.
74
+ # There is also a fast mode. It's about 10 times faster, but it supports only 1:1 translation.
75
+ # "Céline hören 10€".reduce( :preserve => "ÄÖÜäöü߀", :fast => true )
76
+ # => "Celine hören 10€"
77
+ #
78
+ # "ÄÖÜäöü߀".reduce( :fast => true )
79
+ # => "AOUaous"
80
+ #
81
+ # Your result will only contain these characters:
82
+ # * printable letters and basic symbols of the 7-bit ASCII charset (\\x20..\\x7e)
83
+ # * preserved characters as defined in the options (max 18)
84
+ # * newlines (\\x0a and \\x0d)
67
85
  #
68
- def to_ascii
69
- result = self.to_ascii_extra_chars
70
- result.tr!(TR_FULL, TR_REDUCED) # not necessary, only for performance
71
- return UnicodeUtils.nfkd(result).delete('^ -~') # delete is faster than gsub
86
+ # Options:
87
+ # [:preserve] Special characters to preserve. You can only preserve up to 18 characters.
88
+ # [:fast] Fast mode, if true. About 10 times faster, but it supports only 1:1 translation.
89
+ #
90
+ # @return [String]
91
+ def reduce( options ={} )
92
+ preserve = options[:preserve] || ''
93
+ raise ArgumentError, 'max preserve string length is 18 chars' if preserve.length > 18
94
+
95
+ result = self.delete("\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0b\x0c\x0e-\x1f")
96
+ result.tr!(preserve, "\x0e-\x1f") if preserve.length > 0
97
+
98
+ result = result.to_ascii_extra_chars unless options[:fast]
99
+ result.tr!(TR_FULL, TR_REDUCED)
100
+ result = UnicodeUtils.nfkd(result) unless options[:fast]
101
+
102
+ result.delete!("^\x09-\x7e")
103
+ result.tr!("\x0e-\x1f", preserve) if preserve.length > 0
104
+ result
72
105
  end
73
106
 
74
107
 
75
108
 
76
- # Reduces the string to a base94 encoding.
77
- # About 10 times faster than with +UnicodeUtils+.
78
- # 1. Converts àáâăäãāåạąæảấầắằÀÁÂĂÄÃĀÅẠĄÆẢẤẦẮẰ etc. to aaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAA.
79
- # 2. Then removes all non-Ascii-chars.
80
- # 3. Then removes all non-printable Ascii-chars.
81
- # 4. Caution: Also Newlines are removed.
82
- #
83
- # See tests and examples {TestKyaniteStringChars#test_reduce94_a here}.
109
+ # @deprecated Use {#reduce} with +:fast => true+ instead.
84
110
  # @return [String]
85
111
  def reduce94( options={} )
86
- dup.reduce94!(options)
112
+ reduce( {:fast => true}.merge(options) )
87
113
  end
88
114
 
89
115
 
90
- # In-place-variant of {#reduce94 reduce94}.
91
- # @return [String]
92
- def reduce94!( options={} )
93
- self.gsub!( 'ß', options[:german_sz] ) if options[:german_sz]
94
- self.tr!(TR_FULL, TR_REDUCED)
95
- self.delete!('^ -~')
96
- self
97
- end
116
+
98
117
 
99
118
  # Reduziert den String auf ein Base53-Encoding,
100
119
  # bestehend aus Großbuchstaben, Minuszeichen und zu Kleinbuchstaben umgeformten Sonderzeichen.
@@ -16,12 +16,12 @@ unless defined?(TR_UPCASE_ALL_REGEXP)
16
16
  klammer_auf = "\u227a\u226a\u3008\u276c\u2329\u25c1\u25c0"
17
17
  klammer_zu = "\u227b\u226b\u3009\u276d\u232a\u25b7\u25b6"
18
18
 
19
- # Sowohl reduce94 als auch to_ascii werden diese Zeichen übersetzen.
19
+ # Sowohl reduce94 als auch reduce werden diese Zeichen übersetzen.
20
20
  # Zeichen, die TR_FULL ergänzen und die UnicodeUtils.nfkd nicht korrekt umsetzt.
21
21
  tr_full_b = %q{£₤¢‹¥›•«×»÷‚‘ƒ’ˆ§´¡„¿“¦”†‡µ′″°¤∗·⋅} + leerzeichen + klammer_auf + klammer_zu
22
22
  tr_reduced_b = %q{LLc"Y"*"*"/''f'^P'!"?"|"~~u'"~~***} + (" "*leerzeichen.length) + ("<"*klammer_auf.length) + (">"*klammer_zu.length)
23
23
 
24
- # Nur to_ascii wird diese Zeichen übersetzen.
24
+ # Nur reduce wird diese Zeichen übersetzen.
25
25
  # Zeichen, die in TR_FULL schon drin sind und die UnicodeUtils.nfkd nicht korrekt umsetzt
26
26
  tr_full_c = %q{ØøðđÐĐħĦıĸłŁŧþŦÞаАбБцчЦЧдДеэЕЭфФгГхХийИЙюяЮЯкКлЛмМнНоОпПрРсшщСШЩтТуУвВжзЖЗ}
27
27
  tr_reduced_c = %q{OoddDDhHiklLttTTaAbBccCCdDeeEEfFgGhHiiIIjjJJkKlLmMnNoOpPrRsssSSStTuUvVzzZZ}
@@ -29,7 +29,7 @@ unless defined?(TR_UPCASE_ALL_REGEXP)
29
29
 
30
30
 
31
31
 
32
- # Nur to_ascii wird diese Zeichen übersetzen.
32
+ # Nur reduce wird diese Zeichen übersetzen.
33
33
  TR_EXTRA_CHARS = [
34
34
  [/ß/, 'ss'],
35
35
  [/Ö/, 'Oe'],
@@ -267,7 +267,7 @@ if $0 == __FILE__ then
267
267
  see "Überprüfe TR_EXTRA_CHARS"
268
268
  see "========================"
269
269
  see
270
- see "defined in", "Dup if <>0", "Trivial?", "Hex Code", "Character", "reduce94", "to_ascii", "Klassifizierung"
270
+ see "defined in", "Dup if <>0", "Trivial?", "Hex Code", "Character", "reduce94", "reduce", "Klassifizierung"
271
271
  startline = 14
272
272
  i = 0
273
273
  all = ""
@@ -280,7 +280,7 @@ if $0 == __FILE__ then
280
280
  c.to_array_of_hex, # sein Code in HEX
281
281
  c, # das Zeichen
282
282
  c.reduce94, # was reduce94 daraus macht
283
- c.to_ascii, # was to_ascii daraus macht
283
+ c.reduce, # was reduce daraus macht
284
284
  UnicodeUtils.char_type(c)
285
285
 
286
286
  i+=1
@@ -293,7 +293,7 @@ if $0 == __FILE__ then
293
293
  see "Überprüfe TR_FULL"
294
294
  see "================="
295
295
  see
296
- see "Nr", "Dup if <>0", "Trivial?", "Hex Code", "Character", "reduce94", "to_ascii", "Klassifizierung"
296
+ see "Nr", "Dup if <>0", "Trivial?", "Hex Code", "Character", "reduce94", "reduce", "Klassifizierung"
297
297
  i = 0
298
298
  all = ""
299
299
  #TR_FULL_TO_ASCII.each_char do |c|
@@ -305,7 +305,7 @@ if $0 == __FILE__ then
305
305
  c.to_array_of_hex, # sein Code in HEX
306
306
  c, # das Zeichen
307
307
  c.reduce94, # was reduce94 daraus macht
308
- c.to_ascii, # was to_ascii daraus macht
308
+ c.reduce, # was reduce daraus macht
309
309
  UnicodeUtils.char_type(c)
310
310
 
311
311
  i+=1
@@ -44,7 +44,7 @@ class TestKyaniteStringChars < UnitTest
44
44
  assert_equal 0, all.to_a.to_set.size-i-1, "TR_FULL: Dup in Zeichen Nr. #{i} Zeichen #{c} >> #{r}"
45
45
  assert c.to_array_of_codepoints[0] > 127, "TR_FULL: Trivialität in Zeichen Nr. #{i} Zeichen #{c} >> #{r}"
46
46
  assert r.to_array_of_codepoints[0] <= 127, "TR_FULL: Zeichen Nr. #{i} Zeichen #{c} >> #{r} wird nicht in ASCII umgesetzt"
47
- assert_equal c.reduce94, c.to_ascii[0]
47
+ assert_equal c.reduce94, c.reduce[0]
48
48
  i+=1
49
49
  end
50
50
  end
@@ -87,7 +87,7 @@ class TestKyaniteStringChars < UnitTest
87
87
  full = 'ªàáâăãāåạąảấầắằÀÁÂĂÃĀÅẠĄẢẤẦẮẰ'
88
88
  reduced = 'aaaaaaaaaaaaaaaAAAAAAAAAAAAAA'
89
89
  assert_equal reduced, full.reduce94
90
- assert_equal reduced, full.to_ascii
90
+ assert_equal reduced, full.reduce
91
91
  end
92
92
 
93
93
  def test_to_ascii_b
@@ -95,7 +95,7 @@ class TestKyaniteStringChars < UnitTest
95
95
  reduced1 = 'cccccCCCCCdDeeeeeeeeeeEEEEEEEEEE'
96
96
  reduced2 = 'ccccchCCCCChdDeeeeeeeeeeEEEEEEEEEE'
97
97
  assert_equal reduced1, full.reduce94
98
- assert_equal reduced2, full.to_ascii
98
+ assert_equal reduced2, full.reduce
99
99
  end
100
100
 
101
101
  def test_to_ascii_c
@@ -103,14 +103,14 @@ class TestKyaniteStringChars < UnitTest
103
103
  reduced1 = 'ggggGGGGhHiiiiiiiiIIIIIIIIIjJkKllllLLLL'
104
104
  reduced2 = 'ggghgGGGhGhHiiiiiiiiIIIIIIIIIjJkKllllLLLL'
105
105
  assert_equal reduced1, full.reduce94
106
- assert_equal reduced2, full.to_ascii
106
+ assert_equal reduced2, full.reduce
107
107
  end
108
108
 
109
109
  def test_to_ascii_e
110
110
  full = 'ńňñņʼnŃŇÑŅòóôŏõōőơÒÓÔŎÕŌŐƠ'
111
111
  reduced = 'nnnnnNNNNooooooooOOOOOOOO'
112
112
  assert_equal reduced, full.reduce94
113
- assert_equal reduced, full.to_ascii
113
+ assert_equal reduced, full.reduce
114
114
  end
115
115
 
116
116
  def test_to_ascii_f
@@ -118,24 +118,24 @@ class TestKyaniteStringChars < UnitTest
118
118
  reduced1 = 'rrrRRRssssSSSSttTTuuuuuuuuuuUUUUUUUUUUwWyyyYYYzzzZZZ'
119
119
  reduced2 = 'rrrRRRssshsSSShSttTTuuuuuuuuuuUUUUUUUUUUwWyyyYYYzzzhZZZh'
120
120
  assert_equal reduced1, full.reduce94
121
- assert_equal reduced2, full.to_ascii
121
+ assert_equal reduced2, full.reduce
122
122
  end
123
123
 
124
124
  def test_to_ascii_zusammengesetzt
125
125
  full = 'ijIJſ…'
126
126
  reduced = 'ijIJs...'
127
- assert_equal reduced, full.to_ascii
127
+ assert_equal reduced, full.reduce
128
128
  end
129
129
 
130
130
  def test_to_ascii_same_same
131
131
  same_same = '^!"$%&/()=?@*+~#<>|,;:.-_ {[]}\\'
132
- assert_equal same_same, same_same.to_ascii
132
+ assert_equal same_same, same_same.reduce
133
133
  same_same = "'0123456789"
134
- assert_equal same_same, same_same.to_ascii
134
+ assert_equal same_same, same_same.reduce
135
135
  same_same = 'abcdefghijklmnopqrstuvwxyz'
136
- assert_equal same_same, same_same.to_ascii
136
+ assert_equal same_same, same_same.reduce
137
137
  same_same = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
138
- assert_equal same_same, same_same.to_ascii
138
+ assert_equal same_same, same_same.reduce
139
139
  end
140
140
 
141
141
 
@@ -143,7 +143,7 @@ class TestKyaniteStringChars < UnitTest
143
143
  full = '¯¨'
144
144
  reduced = ' ' * full.length
145
145
  assert_equal 2, full.length
146
- assert_equal reduced, full.to_ascii
146
+ assert_equal reduced, full.reduce
147
147
  end
148
148
 
149
149
  def test_to_ascii_s
@@ -155,29 +155,63 @@ class TestKyaniteStringChars < UnitTest
155
155
  reduced1 = "sOUAoua"
156
156
  reduced2 = "ffiIX235EURssOeUeAeoeueae"
157
157
  assert_equal reduced1, full.reduce94
158
- assert_equal reduced2, full.to_ascii
158
+ assert_equal reduced2, full.reduce
159
159
  end
160
160
 
161
161
  def test_LANG_SPECIAL_CHARS
162
162
  LANG_SPECIAL_CHARS .each do | lang, (full, reduced) |
163
- #see lang, full, reduced, full.to_ascii, full.reduce94
164
- assert_equal reduced, full.to_ascii
163
+ #see lang, full, reduced, full.reduce, full.reduce94
164
+ assert_equal reduced, full.reduce
165
165
  end
166
166
  end
167
167
 
168
168
  def test_spaces
169
169
  spaces = "\u0020\u00a0\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u202f\u205f\u3000\u2420\u2423"
170
- assert_equal spaces.to_ascii, " " * spaces.length
170
+ assert_equal spaces.reduce, " " * spaces.length
171
171
  assert_equal spaces.reduce94, " " * spaces.length
172
172
  end
173
173
 
174
174
 
175
175
  def test_minus_signs
176
176
  minus = "\u00ac\u2212\u2010\u2011\u2012\u2013\u2014\u2015\u2500"
177
- assert_equal minus.to_ascii, "-" * minus.length
177
+ assert_equal minus.reduce, "-" * minus.length
178
178
  #assert_equal spaces.reduce94, " " * spaces.length
179
179
  end
180
180
 
181
+
182
+ def test_preserve
183
+ # 0123456789012345678901234567890123456789
184
+ test = "ßàáâăäãāāāåạąæảấầắằÀÁÂĂÄÃĀÅẠĄÆẢẤẦẮẰćĉčçċĆĈČÇĊďðđĎÐĐèéêěĕëēėęếÈÉÊĚĔËĒĖĘẾĝğġģĜĞĠĢĥħĤĦìíîĭïĩīıįijÌÍÎĬÏĨĪİĮIJĵĴķĶĺľłļŀĹĽŁĻĿńňñņʼnŋŃŇÑŅŊòóôŏöõōøőơœÒÓÔŎÖÕŌØŐƠŒŕřŗŔŘŖśŝšßşŚŜŠŞţťŧþŢŤŦÞùúûŭüũūůűųưÙÚÛŬÜŨŪŮŰŲƯŵŴýŷÿÝŶŸźżžŹŻŽ"
185
+ belassen = test[10..27]
186
+ exp = "ssaaaaaeaaaaåạąæảấầắằÀÁÂĂÄÃĀÅẠAAEAAAAAccccchCCCCChdddDDDeeeeeeeeeeEEEEEEEEEEggghgGGGhGhhHHiiiiiiiiiijIIIIIIIIIIJjJkKlllllLLLLLnnnnnnjNNNNNJoooooeooooooeOOOOOeOOOOOOErrrRRRssshsssSSShSttttTTTTuuuuueuuuuuuUUUUUeUUUUUUwWyyyYYYzzzhZZZh"
187
+ assert_equal exp, test.reduce(:preserve => belassen)
188
+ assert_raise ArgumentError do
189
+ belassen = test[10..28]
190
+ test.reduce(:preserve => belassen)
191
+ end
192
+ test = "Háâaäãaållo\nWelt"
193
+ assert_equal "Haaaäaaallo\nWelt", test.reduce( :preserve =>"äöüßÄÖÜ" )
194
+
195
+ end
196
+
197
+
198
+
199
+ def test_examples
200
+ assert_equal "Celine hoeren", "Céline hören".reduce
201
+ assert_equal "AeOeUeaeoeuess", "ÄÖÜäöüß".reduce
202
+ assert_equal "Celine hören 10EUR", "Céline hören 10€".reduce( :preserve => "ÄÖÜäöüß")
203
+ assert_equal "Celine hören 10€", "Céline hören 10€".reduce( :preserve => "ÄÖÜäöü߀", :fast => true)
204
+ assert_equal "AOUaous", "ÄÖÜäöü߀".reduce( :fast => true )
205
+ end
206
+
207
+
208
+ def test_newlines_and_nonprintables
209
+ test = "Céli\x00ne\nhöre\x0c\x0e\x0fn"
210
+ assert_equal "Celine\nhören", test.reduce( :preserve => "ÄÖÜäöüß")
211
+ assert_equal "Celine\nhoeren", test.reduce
212
+ assert_equal "Celine\nhoren", test.reduce(:fast => true )
213
+ end
214
+
181
215
 
182
216
 
183
217
 
@@ -247,13 +281,7 @@ ENDOFSTRING
247
281
  assert_equal 'SCHEIZE', 'Scheiße'.reduce53(:german_sz => 'z')
248
282
  assert_equal 'SCHEIZE', 'Scheiße'.reduce53(:german_sz => 'Z')
249
283
  assert_equal 'SCHEISSE', 'Scheiße'.reduce53(:german_sz => 'SS')
250
-
251
- # geht vielleicht in Ruby 1.9
252
- assert_equal 'Scheize', 'Scheiße'.reduce94(:german_sz => 'z')
253
- assert_equal 'ScheiZe', 'Scheiße'.reduce94(:german_sz => 'Z')
254
- assert_equal 'Scheisse', 'Scheiße'.reduce94(:german_sz => 'ss')
255
- assert_equal 'Schei$e', 'Scheiße'.reduce94(:german_sz => '$')
256
- assert_equal 'Schei$e', 'Schei$e'.reduce94
284
+ assert_equal 'Scheiß Arsche', 'Scheiß Ärsche'.reduce94(:preserve => 'ß')
257
285
  end
258
286
 
259
287
 
data/version.rb CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  module Kyanite
4
4
 
5
- VERSION = '0.8.0'
5
+ VERSION = '0.8.1'
6
6
 
7
7
  end
8
8
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kyanite
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.0
4
+ version: 0.8.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-11-17 00:00:00.000000000 Z
12
+ date: 2012-11-18 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: drumherum