activesupport-inflector 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,8 @@
1
# encoding: utf-8

module ActiveSupport #:nodoc:
  module Multibyte #:nodoc:
    # Raised when a string's byte sequence is not valid for the expected
    # encoding (e.g. malformed UTF-8).
    class EncodingError < StandardError
    end
  end
end
@@ -0,0 +1,393 @@
1
# encoding: utf-8
module ActiveSupport
  module Multibyte
    # Unicode-aware string operations: normalization, grapheme-cluster
    # unpacking and case mapping, backed by a marshalled copy of the
    # Unicode character database (see UnicodeDatabase below).
    module Unicode

      # All instance methods below double as module methods
      # (e.g. Unicode.normalize) thanks to extend self.
      extend self

      # A list of all available normalization forms. See http://www.unicode.org/reports/tr15/tr15-29.html for more
      # information about normalization.
      NORMALIZATION_FORMS = [:c, :kc, :d, :kd]

      # The Unicode version that is supported by the implementation
      UNICODE_VERSION = '5.2.0'

      # The default normalization used for operations that require normalization. It can be set to any of the
      # normalizations in NORMALIZATION_FORMS.
      #
      # Example:
      #   ActiveSupport::Multibyte::Unicode.default_normalization_form = :c
      attr_accessor :default_normalization_form
      @default_normalization_form = :kc

      # Hangul character boundaries and properties, used by the arithmetic
      # Hangul syllable decomposition/composition in the methods below.
      HANGUL_SBASE = 0xAC00
      HANGUL_LBASE = 0x1100
      HANGUL_VBASE = 0x1161
      HANGUL_TBASE = 0x11A7
      HANGUL_LCOUNT = 19
      HANGUL_VCOUNT = 21
      HANGUL_TCOUNT = 28
      HANGUL_NCOUNT = HANGUL_VCOUNT * HANGUL_TCOUNT
      HANGUL_SCOUNT = 11172
      HANGUL_SLAST = HANGUL_SBASE + HANGUL_SCOUNT
      HANGUL_JAMO_FIRST = 0x1100
      HANGUL_JAMO_LAST = 0x11FF

      # All the unicode whitespace
      WHITESPACE = [
        (0x0009..0x000D).to_a, # White_Space # Cc   [5] <control-0009>..<control-000D>
        0x0020,                # White_Space # Zs       SPACE
        0x0085,                # White_Space # Cc       <control-0085>
        0x00A0,                # White_Space # Zs       NO-BREAK SPACE
        0x1680,                # White_Space # Zs       OGHAM SPACE MARK
        0x180E,                # White_Space # Zs       MONGOLIAN VOWEL SEPARATOR
        (0x2000..0x200A).to_a, # White_Space # Zs  [11] EN QUAD..HAIR SPACE
        0x2028,                # White_Space # Zl       LINE SEPARATOR
        0x2029,                # White_Space # Zp       PARAGRAPH SEPARATOR
        0x202F,                # White_Space # Zs       NARROW NO-BREAK SPACE
        0x205F,                # White_Space # Zs       MEDIUM MATHEMATICAL SPACE
        0x3000,                # White_Space # Zs       IDEOGRAPHIC SPACE
      ].flatten.freeze

      # BOM (byte order mark) can also be seen as whitespace, it's a non-rendering character used to distinguish
      # between little and big endian. This is not an issue in utf-8, so it must be ignored.
      LEADERS_AND_TRAILERS = WHITESPACE + [65279] # ZERO-WIDTH NO-BREAK SPACE aka BOM
57
+ # Returns a regular expression pattern that matches the passed Unicode codepoints
58
+ def self.codepoints_to_pattern(array_of_codepoints) #:nodoc:
59
+ array_of_codepoints.collect{ |e| [e].pack 'U*' }.join('|')
60
+ end
61
+ TRAILERS_PAT = /(#{codepoints_to_pattern(LEADERS_AND_TRAILERS)})+\Z/u
62
+ LEADERS_PAT = /\A(#{codepoints_to_pattern(LEADERS_AND_TRAILERS)})+/u
63
+
64
+ # Unpack the string at codepoints boundaries. Raises an EncodingError when the encoding of the string isn't
65
+ # valid UTF-8.
66
+ #
67
+ # Example:
68
+ # Unicode.u_unpack('Café') # => [67, 97, 102, 233]
69
+ def u_unpack(string)
70
+ begin
71
+ string.unpack 'U*'
72
+ rescue ArgumentError
73
+ raise EncodingError, 'malformed UTF-8 character'
74
+ end
75
+ end
76
+
77
+ # Detect whether the codepoint is in a certain character class. Returns +true+ when it's in the specified
78
+ # character class and +false+ otherwise. Valid character classes are: <tt>:cr</tt>, <tt>:lf</tt>, <tt>:l</tt>,
79
+ # <tt>:v</tt>, <tt>:lv</tt>, <tt>:lvt</tt> and <tt>:t</tt>.
80
+ #
81
+ # Primarily used by the grapheme cluster support.
82
+ def in_char_class?(codepoint, classes)
83
+ classes.detect { |c| database.boundary[c] === codepoint } ? true : false
84
+ end
85
+
86
      # Unpack the string at grapheme boundaries. Returns a list of character lists.
      #
      # Example:
      #   Unicode.g_unpack('क्षि') # => [[2325, 2381], [2359], [2367]]
      #   Unicode.g_unpack('Café') # => [[67], [97], [102], [233]]
      def g_unpack(string)
        codepoints = u_unpack(string)
        unpacked = []
        pos = 0
        marker = 0
        eoc = codepoints.length
        while(pos < eoc)
          pos += 1
          previous = codepoints[pos-1]
          # NOTE: on the final iteration (pos == eoc) current is nil, which
          # fails every "do not break" rule below and flushes the last cluster.
          current = codepoints[pos]
          # Each test is one of the grapheme-cluster "do not break" rules
          # (CRxLF, Hangul jamo sequences, combining extends). When a rule
          # matches, the empty then-branch deliberately keeps accumulating;
          # otherwise codepoints[marker..pos-1] is emitted as one cluster.
          if (
            # CR X LF
            ( previous == database.boundary[:cr] and current == database.boundary[:lf] ) or
            # L X (L|V|LV|LVT)
            ( database.boundary[:l] === previous and in_char_class?(current, [:l,:v,:lv,:lvt]) ) or
            # (LV|V) X (V|T)
            ( in_char_class?(previous, [:lv,:v]) and in_char_class?(current, [:v,:t]) ) or
            # (LVT|T) X (T)
            ( in_char_class?(previous, [:lvt,:t]) and database.boundary[:t] === current ) or
            # X Extend
            (database.boundary[:extend] === current)
          )
          else
            unpacked << codepoints[marker..pos-1]
            marker = pos
          end
        end
        unpacked
      end
120
+
121
+ # Reverse operation of g_unpack.
122
+ #
123
+ # Example:
124
+ # Unicode.g_pack(Unicode.g_unpack('क्षि')) # => 'क्षि'
125
+ def g_pack(unpacked)
126
+ (unpacked.flatten).pack('U*')
127
+ end
128
+
129
      # Re-order codepoints so the string becomes canonical: adjacent
      # combining marks (combining class > 0) are bubble-sorted into
      # ascending combining-class order; starters are never moved.
      # Mutates and returns the passed array.
      def reorder_characters(codepoints)
        length = codepoints.length- 1
        pos = 0
        while pos < length do
          cp1, cp2 = database.codepoints[codepoints[pos]], database.codepoints[codepoints[pos+1]]
          if (cp1.combining_class > cp2.combining_class) && (cp2.combining_class > 0)
            codepoints[pos..pos+1] = cp2.code, cp1.code
            # Step back one position (when possible) so the swapped character
            # is also compared against its new predecessor.
            pos += (pos > 0 ? -1 : 1)
          else
            pos += 1
          end
        end
        codepoints
      end
144
+
145
      # Decompose composed characters to the decomposed form.
      #
      # * <tt>type</tt> - :canonical restricts decomposition to canonical
      #   mappings (those with no decomp_type); callers pass :compatability
      #   (note the project's spelling) to also apply compatibility mappings.
      # * <tt>codepoints</tt> - array of integer codepoints; returns a new array.
      def decompose_codepoints(type, codepoints)
        codepoints.inject([]) do |decomposed, cp|
          # if it's a hangul syllable starter character
          if HANGUL_SBASE <= cp and cp < HANGUL_SLAST
            # Hangul syllables decompose arithmetically into their L and V
            # jamo, plus a T jamo when the syllable has a trailing consonant.
            sindex = cp - HANGUL_SBASE
            ncp = [] # new codepoints
            ncp << HANGUL_LBASE + sindex / HANGUL_NCOUNT
            ncp << HANGUL_VBASE + (sindex % HANGUL_NCOUNT) / HANGUL_TCOUNT
            tindex = sindex % HANGUL_TCOUNT
            ncp << (HANGUL_TBASE + tindex) unless tindex == 0
            decomposed.concat ncp
          # if the codepoint is decomposable with the current decomposition type
          elsif (ncp = database.codepoints[cp].decomp_mapping) and (!database.codepoints[cp].decomp_type || type == :compatability)
            # Recurse so mappings that themselves decompose are fully expanded.
            decomposed.concat decompose_codepoints(type, ncp.dup)
          else
            decomposed << cp
          end
        end
      end
165
+
166
      # Compose decomposed characters to the composed form: Hangul jamo are
      # composed arithmetically, everything else through the database's
      # composition map. Mutates and returns the passed array.
      def compose_codepoints(codepoints)
        pos = 0
        eoa = codepoints.length - 1
        starter_pos = 0
        starter_char = codepoints[0]
        # Combining class of the previously seen mark; -1 means "no blocker",
        # so the next mark is free to compose with the starter.
        previous_combining_class = -1
        while pos < eoa
          pos += 1
          lindex = starter_char - HANGUL_LBASE
          # -- Hangul
          if 0 <= lindex and lindex < HANGUL_LCOUNT
            # The inline rescue turns an out-of-range (nil) lookup into -1,
            # which simply fails the range check below.
            vindex = codepoints[starter_pos+1] - HANGUL_VBASE rescue vindex = -1
            if 0 <= vindex and vindex < HANGUL_VCOUNT
              tindex = codepoints[starter_pos+2] - HANGUL_TBASE rescue tindex = -1
              if 0 <= tindex and tindex < HANGUL_TCOUNT
                # L+V+T: three jamo collapse into one syllable.
                j = starter_pos + 2
                eoa -= 2
              else
                # L+V only: two jamo collapse into one syllable.
                tindex = 0
                j = starter_pos + 1
                eoa -= 1
              end
              codepoints[starter_pos..j] = (lindex * HANGUL_VCOUNT + vindex) * HANGUL_TCOUNT + tindex + HANGUL_SBASE
            end
            starter_pos += 1
            starter_char = codepoints[starter_pos]
          # -- Other characters
          else
            current_char = codepoints[pos]
            current = database.codepoints[current_char]
            # A mark may only compose when not "blocked" by an earlier mark
            # of greater-or-equal combining class.
            if current.combining_class > previous_combining_class
              if ref = database.composition_map[starter_char]
                composition = ref[current_char]
              else
                composition = nil
              end
              unless composition.nil?
                # Replace the starter with the composed character and drop
                # the mark that was absorbed into it.
                codepoints[starter_pos] = composition
                starter_char = composition
                codepoints.delete_at pos
                eoa -= 1
                pos -= 1
                previous_combining_class = -1
              else
                previous_combining_class = current.combining_class
              end
            else
              previous_combining_class = current.combining_class
            end
            if current.combining_class == 0
              # A class-0 character opens a new composition window.
              starter_pos = pos
              starter_char = codepoints[pos]
            end
          end
        end
        codepoints
      end
224
+
225
      # Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent resulting in a valid UTF-8 string.
      #
      # Passing +true+ will forcibly tidy all bytes, assuming that the string's encoding is entirely CP1252 or ISO-8859-1.
      def tidy_bytes(string, force = false)
        if force
          # Treat every byte as CP1252/ISO-8859-1 and re-encode wholesale.
          return string.unpack("C*").map do |b|
            tidy_byte(b)
          end.flatten.compact.pack("C*").unpack("U*").pack("U*")
        end

        bytes = string.unpack("C*")
        conts_expected = 0 # continuation bytes still owed by the current sequence
        last_lead = 0      # index of the most recent leading byte

        bytes.each_index do |i|

          byte = bytes[i]
          is_cont = byte > 127 && byte < 192
          is_lead = byte > 191 && byte < 245
          # NOTE(review): bytes 241..244 satisfy both is_unused and is_lead;
          # the is_unused branch wins below, so they are always cleaned.
          is_unused = byte > 240
          is_restricted = byte > 244

          # Impossible or highly unlikely byte? Clean it.
          if is_unused || is_restricted
            bytes[i] = tidy_byte(byte)
          elsif is_cont
            # Not expecting continuation byte? Clean up. Otherwise, now expect one less.
            conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
          else
            if conts_expected > 0
              # Expected continuation, but got ASCII or leading? Clean backwards up to
              # the leading byte.
              (1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])}
              conts_expected = 0
            end
            if is_lead
              # Final byte is leading? Clean it.
              if i == bytes.length - 1
                bytes[i] = tidy_byte(bytes.last)
              else
                # Valid leading byte? Expect continuations determined by position of
                # first zero bit, with max of 3.
                conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3
                last_lead = i
              end
            end
          end
        end
        bytes.empty? ? "" : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
      end
275
+
276
+ # Returns the KC normalization of the string by default. NFKC is considered the best normalization form for
277
+ # passing strings to databases and validations.
278
+ #
279
+ # * <tt>string</tt> - The string to perform normalization on.
280
+ # * <tt>form</tt> - The form you want to normalize in. Should be one of the following:
281
+ # <tt>:c</tt>, <tt>:kc</tt>, <tt>:d</tt>, or <tt>:kd</tt>. Default is
282
+ # ActiveSupport::Multibyte.default_normalization_form
283
+ def normalize(string, form=nil)
284
+ form ||= @default_normalization_form
285
+ # See http://www.unicode.org/reports/tr15, Table 1
286
+ codepoints = u_unpack(string)
287
+ case form
288
+ when :d
289
+ reorder_characters(decompose_codepoints(:canonical, codepoints))
290
+ when :c
291
+ compose_codepoints(reorder_characters(decompose_codepoints(:canonical, codepoints)))
292
+ when :kd
293
+ reorder_characters(decompose_codepoints(:compatability, codepoints))
294
+ when :kc
295
+ compose_codepoints(reorder_characters(decompose_codepoints(:compatability, codepoints)))
296
+ else
297
+ raise ArgumentError, "#{form} is not a valid normalization variant", caller
298
+ end.pack('U*')
299
+ end
300
+
301
+ def apply_mapping(string, mapping) #:nodoc:
302
+ u_unpack(string).map do |codepoint|
303
+ cp = database.codepoints[codepoint]
304
+ if cp and (ncp = cp.send(mapping)) and ncp > 0
305
+ ncp
306
+ else
307
+ codepoint
308
+ end
309
+ end.pack('U*')
310
+ end
311
+
312
      # Holds data about a codepoint in the Unicode database
      class Codepoint
        # code              - the codepoint's integer value
        # combining_class   - canonical combining class (0 for starters; see reorder_characters)
        # decomp_type       - decomposition type (nil for canonical mappings; see decompose_codepoints)
        # decomp_mapping    - array of codepoints this one decomposes into, if any
        # uppercase_mapping - simple uppercase mapping (non-positive when absent; see apply_mapping)
        # lowercase_mapping - simple lowercase mapping (non-positive when absent; see apply_mapping)
        attr_accessor :code, :combining_class, :decomp_type, :decomp_mapping, :uppercase_mapping, :lowercase_mapping
      end
316
+
317
      # Holds static data from the Unicode database
      class UnicodeDatabase
        ATTRIBUTES = :codepoints, :composition_exclusion, :composition_map, :boundary, :cp1252

        attr_writer(*ATTRIBUTES)

        def initialize
          # NOTE: Hash.new(Codepoint.new) shares one blank Codepoint as the
          # default for all missing keys; safe as long as callers only read it.
          @codepoints = Hash.new(Codepoint.new)
          @composition_exclusion = []
          @composition_map = {}
          @boundary = {}
          @cp1252 = {}
        end

        # Lazy load the Unicode database so it's only loaded when it's actually used
        ATTRIBUTES.each do |attr_name|
          class_eval(<<-EOS, __FILE__, __LINE__ + 1)
            def #{attr_name}  # def codepoints
              load            # load
              @#{attr_name}   # @codepoints
            end               # end
          EOS
        end

        # Loads the Unicode database and returns all the internal objects of UnicodeDatabase.
        def load
          begin
            @codepoints, @composition_exclusion, @composition_map, @boundary, @cp1252 = File.open(self.class.filename, 'rb') { |f| Marshal.load f.read }
          rescue Exception => e
            # NOTE(review): rescuing Exception is very broad (it also catches
            # SignalException/SystemExit); kept as-is to preserve behavior.
            raise IOError.new("Couldn't load the Unicode tables for UTF8Handler (#{e.message}), ActiveSupport::Multibyte is unusable")
          end

          # Redefine the === method so we can write shorter rules for grapheme cluster breaks
          @boundary.each do |k,_|
            @boundary[k].instance_eval do
              def ===(other)
                detect { |i| i === other } ? true : false
              end
            end if @boundary[k].kind_of?(Array)
          end

          # define attr_reader methods for the instance variables
          # (inside an instance method, class << self opens this instance's
          # singleton class, so the plain readers override the lazy-loading
          # methods above and later attribute reads no longer re-enter #load)
          class << self
            attr_reader(*ATTRIBUTES)
          end
        end

        # Returns the directory in which the data files are stored
        def self.dirname
          File.dirname(__FILE__) + '/../values/'
        end

        # Returns the filename for the data file for this version
        def self.filename
          File.expand_path File.join(dirname, "unicode_tables.dat")
        end
      end
374
+
375
+ private
376
+
377
+ def tidy_byte(byte)
378
+ if byte < 160
379
+ [database.cp1252[byte] || byte].pack("U").unpack("C*")
380
+ elsif byte < 192
381
+ [194, byte]
382
+ else
383
+ [195, byte - 64]
384
+ end
385
+ end
386
+
387
      # Memoized accessor for the lazily-created UnicodeDatabase instance
      # shared by all the methods in this module.
      def database
        @database ||= UnicodeDatabase.new
      end
390
+
391
+ end
392
+ end
393
+ end
@@ -0,0 +1,60 @@
1
# encoding: utf-8

module ActiveSupport #:nodoc:
  module Multibyte #:nodoc:
    # Each pair below picks an implementation at load time depending on
    # whether the runtime has native encoding support (Ruby 1.9+) or not
    # (Ruby 1.8). VALID_CHARACTER is expected to be defined elsewhere in
    # this library — TODO confirm against the full package.
    if Kernel.const_defined?(:Encoding)
      # Returns a regular expression that matches valid characters in the current encoding
      def self.valid_character
        VALID_CHARACTER[Encoding.default_external.to_s]
      end
    else
      # Ruby 1.8: pick the pattern from the global $KCODE setting; returns
      # nil for any other $KCODE value (no case branch matches).
      def self.valid_character
        case $KCODE
        when 'UTF8'
          VALID_CHARACTER['UTF-8']
        when 'SJIS'
          VALID_CHARACTER['Shift_JIS']
        end
      end
    end

    if 'string'.respond_to?(:valid_encoding?)
      # Verifies the encoding of a string
      def self.verify(string)
        string.valid_encoding?
      end
    else
      # Ruby 1.8 fallback: validate character-by-character against the
      # pattern for the current $KCODE; everything passes when there is
      # no pattern to check against.
      def self.verify(string)
        if expression = valid_character
          # Splits the string on character boundaries, which are determined based on $KCODE.
          string.split(//).all? { |c| expression =~ c }
        else
          true
        end
      end
    end

    # Verifies the encoding of the string and raises an exception when it's not valid
    def self.verify!(string)
      raise EncodingError.new("Found characters with invalid encoding") unless verify(string)
    end

    if 'string'.respond_to?(:force_encoding)
      # Removes all invalid characters from the string.
      #
      # Note: this method is a no-op in Ruby 1.9
      def self.clean(string)
        string
      end
    else
      # Ruby 1.8: keep only the characters matching the valid-character
      # pattern for the current $KCODE; pass the string through untouched
      # when there is no pattern.
      def self.clean(string)
        if expression = valid_character
          # Splits the string on character boundaries, which are determined based on $KCODE.
          string.split(//).grep(expression).join
        else
          string
        end
      end
    end
  end
end