kyanite 0.8.0 → 0.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
1
  == 0.8.0 2012-11-17
2
- * added String#to_ascii with human-like handling of unicode special characters
2
+ * added String#reduce with human-like handling of unicode special characters
3
3
 
4
4
  == 0.7.5 2012-11-14
5
5
  * added FSymbol class
@@ -45,56 +45,75 @@ class String
45
45
  end
46
46
 
47
47
 
48
- # Reduces the string to a ASCII encoding. Example:
48
+ # Reduces a rich unicode string to a very limited character set like humans do. Example:
49
+ # "Céline hören".reduce
50
+ # => "Celine hoeren"
51
+ #
52
+ # Handles all characters from ISO/IEC 8859-1 and CP1252
53
+ # like humans do, not just deleting the accents.
54
+ # So it's not a 1:1 translation, some unicode characters are translated to
55
+ # multiple characters. Example:
56
+ # "ÄÖÜäöüß".reduce
57
+ # => "AeOeUeaeoeuess"
58
+ #
59
+ # For many unicode characters, this behaviour is based on +UnicodeUtils.nfkd+. Example:
49
60
  # ffi = "\uFB03"
50
61
  # ix = "\u2168"
51
62
  # high23="²³"
52
63
  # high5 = "\u2075"
53
64
  # all = ffi + ix + high23 + high5
54
- # all.to_ascii
65
+ # all.reduce
55
66
  # => "ffiIX235"
56
67
  #
57
- # Based on +UnicodeUtils.nfkd+, but handles all characters from ISO/IEC 8859-1 and CP1252
58
- # like humans do, not just deleting the accents. Example:
59
- # "ÄÖÜäöüß".to_ascii
60
- # => "AeOeUeaeoeuess"
68
+ # You can preserve some characters, e.g. all special characters of a specific language. Example:
69
+ # "Céline hören 10€".reduce( :preserve => "ÄÖÜäöüß")
70
+ # => "Celine hören 10EUR"
71
+ #
72
+ # Newlines are preserved by default, but all other non-printable ASCII characters below \\x20 are removed.
61
73
  #
62
- # 1. Converts ÄÖÜäöüßàáâăäãāåạąæảấầắằ etc. to AeOeUeaeoeuessaaaaaaaaaaaaaaaa.
63
- # 2. Then removes all non-Ascii-chars.
64
- # 3. Then removes all non-printable Ascii-chars.
65
- # 4. Caution: Also Newlines are removed.
66
- # About 10 times slower than {#reduce94 reduce94}, but more accurate.
74
+ # There is also a fast mode. It's about 10 times faster, but it supports only 1:1 translation.
75
+ # "Céline hören 10€".reduce( :preserve => "ÄÖÜäöü߀", :fast => true )
76
+ # => "Celine hören 10€"
77
+ #
78
+ # "ÄÖÜäöü߀".reduce( :fast => true )
79
+ # => "AOUaous"
80
+ #
81
+ # Your result will only contain these characters:
82
+ # * printable letters and basic symbols of the 7bit ASCII charset (\\x20..\\x7e)
83
+ # * preserved characters as defined in the options (max 18)
84
+ # * newlines (\\x0a and \\x0d)
67
85
  #
68
- def to_ascii
69
- result = self.to_ascii_extra_chars
70
- result.tr!(TR_FULL, TR_REDUCED) # not necessary, only for performance
71
- return UnicodeUtils.nfkd(result).delete('^ -~') # delete is faster than gsub
86
+ # Options:
87
+ # [:preserve] Special characters to preserve. You can only preserve up to 18 characters.
88
+ # [:fast] Fast mode, if true. About 10 times faster, but it supports only 1:1 translation.
89
+ #
90
+ # @return [String]
91
+ def reduce( options ={} )
92
+ preserve = options[:preserve] || ''
93
+ raise ArgumentError, 'max preserve string length is 18 chars' if preserve.length > 18
94
+
95
+ result = self.delete("\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0b\x0c\x0e-\x1f")
96
+ result.tr!(preserve, "\x0e-\x1f") if preserve.length > 0
97
+
98
+ result = result.to_ascii_extra_chars unless options[:fast]
99
+ result.tr!(TR_FULL, TR_REDUCED)
100
+ result = UnicodeUtils.nfkd(result) unless options[:fast]
101
+
102
+ result.delete!("^\x09-\x7e")
103
+ result.tr!("\x0e-\x1f", preserve) if preserve.length > 0
104
+ result
72
105
  end
73
106
 
74
107
 
75
108
 
76
- # Reduces the string to a base94 encoding.
77
- # About 10 times faster than with +UnicodeUtils+.
78
- # 1. Converts àáâăäãāåạąæảấầắằÀÁÂĂÄÃĀÅẠĄÆẢẤẦẮẰ etc. to aaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAA.
79
- # 2. Then removes all non-Ascii-chars.
80
- # 3. Then removes all non-printable Ascii-chars.
81
- # 4. Caution: Also Newlines are removed.
82
- #
83
- # See tests and examples {TestKyaniteStringChars#test_reduce94_a here}.
109
+ # @deprecated
84
110
  # @return [String]
85
111
  def reduce94( options={} )
86
- dup.reduce94!(options)
112
+ reduce( {:fast => true}.merge(options) )
87
113
  end
88
114
 
89
115
 
90
- # In-place-variant of {#reduce94 reduce94}.
91
- # @return [String]
92
- def reduce94!( options={} )
93
- self.gsub!( 'ß', options[:german_sz] ) if options[:german_sz]
94
- self.tr!(TR_FULL, TR_REDUCED)
95
- self.delete!('^ -~')
96
- self
97
- end
116
+
98
117
 
99
118
  # Reduziert den String auf ein Base53-Encoding,
100
119
  # bestehend aus Großbuchstaben, Minuszeichen und zu Kleinbuchstaben umgeformten Sonderzeichen.
@@ -16,12 +16,12 @@ unless defined?(TR_UPCASE_ALL_REGEXP)
16
16
  klammer_auf = "\u227a\u226a\u3008\u276c\u2329\u25c1\u25c0"
17
17
  klammer_zu = "\u227b\u226b\u3009\u276d\u232a\u25b7\u25b6"
18
18
 
19
- # Sowohl reduce94 als auch to_ascii werden diese Zeichen übersetzen.
19
+ # Sowohl reduce94 als auch reduce werden diese Zeichen übersetzen.
20
20
  # Zeichen, die TR_FULL ergänzen und die UnicodeUtils.nfkd nicht korrekt umsetzt.
21
21
  tr_full_b = %q{£₤¢‹¥›•«×»÷‚‘ƒ’ˆ§´¡„¿“¦”†‡µ′″°¤∗·⋅} + leerzeichen + klammer_auf + klammer_zu
22
22
  tr_reduced_b = %q{LLc"Y"*"*"/''f'^P'!"?"|"~~u'"~~***} + (" "*leerzeichen.length) + ("<"*klammer_auf.length) + (">"*klammer_zu.length)
23
23
 
24
- # Nur to_ascii wird diese Zeichen übersetzen.
24
+ # Nur reduce wird diese Zeichen übersetzen.
25
25
  # Zeichen, die in TR_FULL schon drin sind und die UnicodeUtils.nfkd nicht korrekt umsetzt
26
26
  tr_full_c = %q{ØøðđÐĐħĦıĸłŁŧþŦÞаАбБцчЦЧдДеэЕЭфФгГхХийИЙюяЮЯкКлЛмМнНоОпПрРсшщСШЩтТуУвВжзЖЗ}
27
27
  tr_reduced_c = %q{OoddDDhHiklLttTTaAbBccCCdDeeEEfFgGhHiiIIjjJJkKlLmMnNoOpPrRsssSSStTuUvVzzZZ}
@@ -29,7 +29,7 @@ unless defined?(TR_UPCASE_ALL_REGEXP)
29
29
 
30
30
 
31
31
 
32
- # Nur to_ascii wird diese Zeichen übersetzen.
32
+ # Nur reduce wird diese Zeichen übersetzen.
33
33
  TR_EXTRA_CHARS = [
34
34
  [/ß/, 'ss'],
35
35
  [/Ö/, 'Oe'],
@@ -267,7 +267,7 @@ if $0 == __FILE__ then
267
267
  see "Überprüfe TR_EXTRA_CHARS"
268
268
  see "========================"
269
269
  see
270
- see "defined in", "Dup if <>0", "Trivial?", "Hex Code", "Character", "reduce94", "to_ascii", "Klassifizierung"
270
+ see "defined in", "Dup if <>0", "Trivial?", "Hex Code", "Character", "reduce94", "reduce", "Klassifizierung"
271
271
  startline = 14
272
272
  i = 0
273
273
  all = ""
@@ -280,7 +280,7 @@ if $0 == __FILE__ then
280
280
  c.to_array_of_hex, # sein Code in HEX
281
281
  c, # das Zeichen
282
282
  c.reduce94, # was reduce94 daraus macht
283
- c.to_ascii, # was to_ascii daraus macht
283
+ c.reduce, # was reduce daraus macht
284
284
  UnicodeUtils.char_type(c)
285
285
 
286
286
  i+=1
@@ -293,7 +293,7 @@ if $0 == __FILE__ then
293
293
  see "Überprüfe TR_FULL"
294
294
  see "================="
295
295
  see
296
- see "Nr", "Dup if <>0", "Trivial?", "Hex Code", "Character", "reduce94", "to_ascii", "Klassifizierung"
296
+ see "Nr", "Dup if <>0", "Trivial?", "Hex Code", "Character", "reduce94", "reduce", "Klassifizierung"
297
297
  i = 0
298
298
  all = ""
299
299
  #TR_FULL_TO_ASCII.each_char do |c|
@@ -305,7 +305,7 @@ if $0 == __FILE__ then
305
305
  c.to_array_of_hex, # sein Code in HEX
306
306
  c, # das Zeichen
307
307
  c.reduce94, # was reduce94 daraus macht
308
- c.to_ascii, # was to_ascii daraus macht
308
+ c.reduce, # was reduce daraus macht
309
309
  UnicodeUtils.char_type(c)
310
310
 
311
311
  i+=1
@@ -44,7 +44,7 @@ class TestKyaniteStringChars < UnitTest
44
44
  assert_equal 0, all.to_a.to_set.size-i-1, "TR_FULL: Dup in Zeichen Nr. #{i} Zeichen #{c} >> #{r}"
45
45
  assert c.to_array_of_codepoints[0] > 127, "TR_FULL: Trivialität in Zeichen Nr. #{i} Zeichen #{c} >> #{r}"
46
46
  assert r.to_array_of_codepoints[0] <= 127, "TR_FULL: Zeichen Nr. #{i} Zeichen #{c} >> #{r} wird nicht in ASCII umgesetzt"
47
- assert_equal c.reduce94, c.to_ascii[0]
47
+ assert_equal c.reduce94, c.reduce[0]
48
48
  i+=1
49
49
  end
50
50
  end
@@ -87,7 +87,7 @@ class TestKyaniteStringChars < UnitTest
87
87
  full = 'ªàáâăãāåạąảấầắằÀÁÂĂÃĀÅẠĄẢẤẦẮẰ'
88
88
  reduced = 'aaaaaaaaaaaaaaaAAAAAAAAAAAAAA'
89
89
  assert_equal reduced, full.reduce94
90
- assert_equal reduced, full.to_ascii
90
+ assert_equal reduced, full.reduce
91
91
  end
92
92
 
93
93
  def test_to_ascii_b
@@ -95,7 +95,7 @@ class TestKyaniteStringChars < UnitTest
95
95
  reduced1 = 'cccccCCCCCdDeeeeeeeeeeEEEEEEEEEE'
96
96
  reduced2 = 'ccccchCCCCChdDeeeeeeeeeeEEEEEEEEEE'
97
97
  assert_equal reduced1, full.reduce94
98
- assert_equal reduced2, full.to_ascii
98
+ assert_equal reduced2, full.reduce
99
99
  end
100
100
 
101
101
  def test_to_ascii_c
@@ -103,14 +103,14 @@ class TestKyaniteStringChars < UnitTest
103
103
  reduced1 = 'ggggGGGGhHiiiiiiiiIIIIIIIIIjJkKllllLLLL'
104
104
  reduced2 = 'ggghgGGGhGhHiiiiiiiiIIIIIIIIIjJkKllllLLLL'
105
105
  assert_equal reduced1, full.reduce94
106
- assert_equal reduced2, full.to_ascii
106
+ assert_equal reduced2, full.reduce
107
107
  end
108
108
 
109
109
  def test_to_ascii_e
110
110
  full = 'ńňñņʼnŃŇÑŅòóôŏõōőơÒÓÔŎÕŌŐƠ'
111
111
  reduced = 'nnnnnNNNNooooooooOOOOOOOO'
112
112
  assert_equal reduced, full.reduce94
113
- assert_equal reduced, full.to_ascii
113
+ assert_equal reduced, full.reduce
114
114
  end
115
115
 
116
116
  def test_to_ascii_f
@@ -118,24 +118,24 @@ class TestKyaniteStringChars < UnitTest
118
118
  reduced1 = 'rrrRRRssssSSSSttTTuuuuuuuuuuUUUUUUUUUUwWyyyYYYzzzZZZ'
119
119
  reduced2 = 'rrrRRRssshsSSShSttTTuuuuuuuuuuUUUUUUUUUUwWyyyYYYzzzhZZZh'
120
120
  assert_equal reduced1, full.reduce94
121
- assert_equal reduced2, full.to_ascii
121
+ assert_equal reduced2, full.reduce
122
122
  end
123
123
 
124
124
  def test_to_ascii_zusammengesetzt
125
125
  full = 'ijIJſ…'
126
126
  reduced = 'ijIJs...'
127
- assert_equal reduced, full.to_ascii
127
+ assert_equal reduced, full.reduce
128
128
  end
129
129
 
130
130
  def test_to_ascii_same_same
131
131
  same_same = '^!"$%&/()=?@*+~#<>|,;:.-_ {[]}\\'
132
- assert_equal same_same, same_same.to_ascii
132
+ assert_equal same_same, same_same.reduce
133
133
  same_same = "'0123456789"
134
- assert_equal same_same, same_same.to_ascii
134
+ assert_equal same_same, same_same.reduce
135
135
  same_same = 'abcdefghijklmnopqrstuvwxyz'
136
- assert_equal same_same, same_same.to_ascii
136
+ assert_equal same_same, same_same.reduce
137
137
  same_same = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
138
- assert_equal same_same, same_same.to_ascii
138
+ assert_equal same_same, same_same.reduce
139
139
  end
140
140
 
141
141
 
@@ -143,7 +143,7 @@ class TestKyaniteStringChars < UnitTest
143
143
  full = '¯¨'
144
144
  reduced = ' ' * full.length
145
145
  assert_equal 2, full.length
146
- assert_equal reduced, full.to_ascii
146
+ assert_equal reduced, full.reduce
147
147
  end
148
148
 
149
149
  def test_to_ascii_s
@@ -155,29 +155,63 @@ class TestKyaniteStringChars < UnitTest
155
155
  reduced1 = "sOUAoua"
156
156
  reduced2 = "ffiIX235EURssOeUeAeoeueae"
157
157
  assert_equal reduced1, full.reduce94
158
- assert_equal reduced2, full.to_ascii
158
+ assert_equal reduced2, full.reduce
159
159
  end
160
160
 
161
161
  def test_LANG_SPECIAL_CHARS
162
162
  LANG_SPECIAL_CHARS .each do | lang, (full, reduced) |
163
- #see lang, full, reduced, full.to_ascii, full.reduce94
164
- assert_equal reduced, full.to_ascii
163
+ #see lang, full, reduced, full.reduce, full.reduce94
164
+ assert_equal reduced, full.reduce
165
165
  end
166
166
  end
167
167
 
168
168
  def test_spaces
169
169
  spaces = "\u0020\u00a0\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u202f\u205f\u3000\u2420\u2423"
170
- assert_equal spaces.to_ascii, " " * spaces.length
170
+ assert_equal spaces.reduce, " " * spaces.length
171
171
  assert_equal spaces.reduce94, " " * spaces.length
172
172
  end
173
173
 
174
174
 
175
175
  def test_minus_signs
176
176
  minus = "\u00ac\u2212\u2010\u2011\u2012\u2013\u2014\u2015\u2500"
177
- assert_equal minus.to_ascii, "-" * minus.length
177
+ assert_equal minus.reduce, "-" * minus.length
178
178
  #assert_equal spaces.reduce94, " " * spaces.length
179
179
  end
180
180
 
181
+
182
+ def test_preserve
183
+ # 0123456789012345678901234567890123456789
184
+ test = "ßàáâăäãāāāåạąæảấầắằÀÁÂĂÄÃĀÅẠĄÆẢẤẦẮẰćĉčçċĆĈČÇĊďðđĎÐĐèéêěĕëēėęếÈÉÊĚĔËĒĖĘẾĝğġģĜĞĠĢĥħĤĦìíîĭïĩīıįijÌÍÎĬÏĨĪİĮIJĵĴķĶĺľłļŀĹĽŁĻĿńňñņʼnŋŃŇÑŅŊòóôŏöõōøőơœÒÓÔŎÖÕŌØŐƠŒŕřŗŔŘŖśŝšßşŚŜŠŞţťŧþŢŤŦÞùúûŭüũūůűųưÙÚÛŬÜŨŪŮŰŲƯŵŴýŷÿÝŶŸźżžŹŻŽ"
185
+ belassen = test[10..27]
186
+ exp = "ssaaaaaeaaaaåạąæảấầắằÀÁÂĂÄÃĀÅẠAAEAAAAAccccchCCCCChdddDDDeeeeeeeeeeEEEEEEEEEEggghgGGGhGhhHHiiiiiiiiiijIIIIIIIIIIJjJkKlllllLLLLLnnnnnnjNNNNNJoooooeooooooeOOOOOeOOOOOOErrrRRRssshsssSSShSttttTTTTuuuuueuuuuuuUUUUUeUUUUUUwWyyyYYYzzzhZZZh"
187
+ assert_equal exp, test.reduce(:preserve => belassen)
188
+ assert_raise ArgumentError do
189
+ belassen = test[10..28]
190
+ test.reduce(:preserve => belassen)
191
+ end
192
+ test = "Háâaäãaållo\nWelt"
193
+ assert_equal "Haaaäaaallo\nWelt", test.reduce( :preserve =>"äöüßÄÖÜ" )
194
+
195
+ end
196
+
197
+
198
+
199
+ def test_examples
200
+ assert_equal "Celine hoeren", "Céline hören".reduce
201
+ assert_equal "AeOeUeaeoeuess", "ÄÖÜäöüß".reduce
202
+ assert_equal "Celine hören 10EUR", "Céline hören 10€".reduce( :preserve => "ÄÖÜäöüß")
203
+ assert_equal "Celine hören 10€", "Céline hören 10€".reduce( :preserve => "ÄÖÜäöü߀", :fast => true)
204
+ assert_equal "AOUaous", "ÄÖÜäöü߀".reduce( :fast => true )
205
+ end
206
+
207
+
208
+ def test_newlines_and_nonprintables
209
+ test = "Céli\x00ne\nhöre\x0c\x0e\x0fn"
210
+ assert_equal "Celine\nhören", test.reduce( :preserve => "ÄÖÜäöüß")
211
+ assert_equal "Celine\nhoeren", test.reduce
212
+ assert_equal "Celine\nhoren", test.reduce(:fast => true )
213
+ end
214
+
181
215
 
182
216
 
183
217
 
@@ -247,13 +281,7 @@ ENDOFSTRING
247
281
  assert_equal 'SCHEIZE', 'Scheiße'.reduce53(:german_sz => 'z')
248
282
  assert_equal 'SCHEIZE', 'Scheiße'.reduce53(:german_sz => 'Z')
249
283
  assert_equal 'SCHEISSE', 'Scheiße'.reduce53(:german_sz => 'SS')
250
-
251
- # geht vielleicht in Ruby 1.9
252
- assert_equal 'Scheize', 'Scheiße'.reduce94(:german_sz => 'z')
253
- assert_equal 'ScheiZe', 'Scheiße'.reduce94(:german_sz => 'Z')
254
- assert_equal 'Scheisse', 'Scheiße'.reduce94(:german_sz => 'ss')
255
- assert_equal 'Schei$e', 'Scheiße'.reduce94(:german_sz => '$')
256
- assert_equal 'Schei$e', 'Schei$e'.reduce94
284
+ assert_equal 'Scheiß Arsche', 'Scheiß Ärsche'.reduce94(:preserve => 'ß')
257
285
  end
258
286
 
259
287
 
data/version.rb CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  module Kyanite
4
4
 
5
- VERSION = '0.8.0'
5
+ VERSION = '0.8.1'
6
6
 
7
7
  end
8
8
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kyanite
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.0
4
+ version: 0.8.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-11-17 00:00:00.000000000 Z
12
+ date: 2012-11-18 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: drumherum