multibyte 0.0.1

Files changed (100)
  1. data/.git/COMMIT_EDITMSG +57 -0
  2. data/.git/HEAD +1 -0
  3. data/.git/config +8 -0
  4. data/.git/description +1 -0
  5. data/.git/hooks/applypatch-msg +15 -0
  6. data/.git/hooks/commit-msg +21 -0
  7. data/.git/hooks/post-commit +8 -0
  8. data/.git/hooks/post-receive +16 -0
  9. data/.git/hooks/post-update +8 -0
  10. data/.git/hooks/pre-applypatch +14 -0
  11. data/.git/hooks/pre-commit +70 -0
  12. data/.git/hooks/pre-rebase +150 -0
  13. data/.git/hooks/update +107 -0
  14. data/.git/index +0 -0
  15. data/.git/info/exclude +6 -0
  16. data/.git/logs/HEAD +4 -0
  17. data/.git/logs/refs/heads/master +4 -0
  18. data/.git/logs/refs/remotes/origin/master +3 -0
  19. data/.git/objects/02/e9a792fa31b00f0f06012e0b5e3ef327c40f64 +0 -0
  20. data/.git/objects/10/0b93820ade4c16225673b4ca62bb3ade63c313 +0 -0
  21. data/.git/objects/10/aab6490174970a2349d1050882c644f8d07006 +7 -0
  22. data/.git/objects/11/cda2eaac1ae103a13bf835456de05f53b0c8ad +0 -0
  23. data/.git/objects/1d/61c71cd5cf0f4dae6305b007a6177e1cecd7ea +0 -0
  24. data/.git/objects/2c/84cd00c156ba3b46e733c7e34dd9aea0c4fa75 +0 -0
  25. data/.git/objects/2f/3a16e4a756ec7a2d32c4bd8386b855f41cae61 +0 -0
  26. data/.git/objects/2f/437425b2c085bb529379e409d6b4bc7f9f9fba +0 -0
  27. data/.git/objects/3f/9afe5eaf00ad4d2ffa2e9315b96f530b0e85d4 +2 -0
  28. data/.git/objects/42/4a5f37c6fe3a7cac54b0f85688c1cce7da9cdf +0 -0
  29. data/.git/objects/47/26f03dc598a372cb959e3cbe33334de70802fd +0 -0
  30. data/.git/objects/55/37e6b2a449fe6d6c800803d0f72ffbcf44c297 +0 -0
  31. data/.git/objects/64/334b80d5fcead16f13cb8b7a45120ea460747b +0 -0
  32. data/.git/objects/69/1ed3b65603a0d0dabfb66720e7163664be78f5 +0 -0
  33. data/.git/objects/72/141584a5836168fe912ebce5b4537bd0eafc09 +1 -0
  34. data/.git/objects/7c/e872bc55ff766e542d2fbbe00e9dde70baaa92 +2 -0
  35. data/.git/objects/93/e03faa6906892434fa9f2ad230c4ececc64055 +0 -0
  36. data/.git/objects/9d/c9a787700bafb70e1833a760b73dec26a6a56e +0 -0
  37. data/.git/objects/9f/672edccfaaa00b6584f599c3e68560e6a5bedf +0 -0
  38. data/.git/objects/ab/1cd78b02ec268ce7d2faf4b1692f4bed9273fe +0 -0
  39. data/.git/objects/ad/8866bd33ae1afe9278cf5ee833d71977af561c +0 -0
  40. data/.git/objects/af/c3ea327fa220ee42a5ebecfcc7d218be4ea1d9 +0 -0
  41. data/.git/objects/b2/56d1cf3a9ba99f308daf1b4cd0dd08f11448e0 +1 -0
  42. data/.git/objects/bc/184b885eca1bed875171bd3990433055dda3af +0 -0
  43. data/.git/objects/c2/7f6559350f7adb19d43742b55b2f91d07b6550 +0 -0
  44. data/.git/objects/c3/e7364c6e90fb80e979779cd822bc8cdc8a3790 +0 -0
  45. data/.git/objects/c9/5128ff8f94e949d796afa128f2f341ba69989c +0 -0
  46. data/.git/objects/ce/e1f5b453e86a5aa913c626641608839bb79eef +1 -0
  47. data/.git/objects/cf/6add7ea568d3d90d6a1f8afb0898b0119b14ff +0 -0
  48. data/.git/objects/d5/e18b1fd7b189fdc75dd3bc4d208323cbc01aa8 +0 -0
  49. data/.git/objects/de/6e40dc4d92c1a889f8581718a48f2fcd219442 +0 -0
  50. data/.git/objects/df/42782d610e8c714f78cf0a018c600667da795e +0 -0
  51. data/.git/objects/e0/65ff01f0883142d4901a371be8dc60998fbe04 +4 -0
  52. data/.git/objects/e1/ad06969c3d3982b06ebd7da3777a0926501e7c +0 -0
  53. data/.git/objects/e4/69154c2ae4c3c596870a5e920f197b7b5be179 +2 -0
  54. data/.git/objects/e4/8464df56bf487e96e21ea99487330266dae3c9 +0 -0
  55. data/.git/objects/e6/9de29bb2d1d6434b8b29ae775ad8c2e48c5391 +0 -0
  56. data/.git/objects/e8/517350dbaadcb01282282218638c6022a20091 +0 -0
  57. data/.git/objects/eb/1bffe6adda4dc202d219cb35fe142a5e42b46b +0 -0
  58. data/.git/objects/eb/5d1c06d3a469df91dff68022b3b179e30cd127 +0 -0
  59. data/.git/objects/ec/f2f7e20cd14b0d4b5c0b7d0450aafc8c4f7066 +0 -0
  60. data/.git/objects/f1/ca0626d3e5dfcf88df23c9720d6f770fcbef73 +0 -0
  61. data/.git/objects/f2/3c73e1110a2fec82d37a1a399b9f274d30f0fe +0 -0
  62. data/.git/objects/f2/aec2820bf2f1c87c031d4abe262560392ac4b9 +0 -0
  63. data/.git/objects/f8/73dae66e0208669dbf74718181b3866647f426 +0 -0
  64. data/.git/refs/heads/master +1 -0
  65. data/.git/refs/remotes/origin/master +1 -0
  66. data/History.txt +4 -0
  67. data/License.txt +20 -0
  68. data/Manifest.txt +97 -0
  69. data/README.txt +19 -0
  70. data/Rakefile +4 -0
  71. data/config/hoe.rb +70 -0
  72. data/config/requirements.rb +17 -0
  73. data/lib/multibyte/chars.rb +135 -0
  74. data/lib/multibyte/generators/generate_tables.rb +145 -0
  75. data/lib/multibyte/handlers/passthru_handler.rb +9 -0
  76. data/lib/multibyte/handlers/utf8_handler.rb +564 -0
  77. data/lib/multibyte/handlers/utf8_handler_proc.rb +43 -0
  78. data/lib/multibyte/version.rb +9 -0
  79. data/lib/multibyte.rb +15 -0
  80. data/lib/unicode.rb +66 -0
  81. data/log/debug.log +0 -0
  82. data/script/destroy +14 -0
  83. data/script/generate +14 -0
  84. data/script/txt2html +74 -0
  85. data/setup.rb +1585 -0
  86. data/spec/multibyte_spec.rb +11 -0
  87. data/spec/spec.opts +1 -0
  88. data/spec/spec_helper.rb +7 -0
  89. data/tasks/deployment.rake +34 -0
  90. data/tasks/environment.rake +7 -0
  91. data/tasks/rspec.rake +21 -0
  92. data/tasks/website.rake +17 -0
  93. data/website/index.html +11 -0
  94. data/website/index.txt +39 -0
  95. data/website/javascripts/rounded_corners_lite.inc.js +285 -0
  96. data/website/stylesheets/screen.css +138 -0
  97. data/website/template.rhtml +48 -0
  98. data.tar.gz.sig +3 -0
  99. metadata +176 -0
  100. metadata.gz.sig +0 -0
data/lib/multibyte/generators/generate_tables.rb
@@ -0,0 +1,145 @@
+ #!/usr/bin/env ruby
+ require 'open-uri'
+ require 'tmpdir'
+
+ module Multibyte::Handlers #:nodoc:
+   class UnicodeDatabase #:nodoc:
+     def self.load
+       [Hash.new(Codepoint.new),[],{},{}]
+     end
+   end
+
+   class UnicodeTableGenerator #:nodoc:
+     BASE_URI = "http://www.unicode.org/Public/#{Multibyte::UNICODE_VERSION}/ucd/"
+     SOURCES = {
+       :codepoints => BASE_URI + 'UnicodeData.txt',
+       :composition_exclusion => BASE_URI + 'CompositionExclusions.txt',
+       :grapheme_break_property => BASE_URI + 'auxiliary/GraphemeBreakProperty.txt',
+       :cp1252 => 'http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT'
+     }
+
+     def initialize
+       @ucd = UnicodeDatabase.new
+
+       default = Codepoint.new
+       default.combining_class = 0
+       default.uppercase_mapping = 0
+       default.lowercase_mapping = 0
+       @ucd.codepoints = Hash.new(default)
+
+       @ucd.composition_exclusion = []
+       @ucd.composition_map = {}
+       @ucd.boundary = {}
+       @ucd.cp1252 = {}
+     end
+
+     def parse_codepoints(line)
+       codepoint = Codepoint.new
+       raise "Could not parse input." unless line =~ /^
+         ([0-9A-F]+); # code
+         ([^;]+); # name
+         ([A-Z]+); # general category
+         ([0-9]+); # canonical combining class
+         ([A-Z]+); # bidi class
+         (<([A-Z]*)>)? # decomposition type
+         ((\ ?[0-9A-F]+)*); # decompomposition mapping
+         ([0-9]*); # decimal digit
+         ([0-9]*); # digit
+         ([^;]*); # numeric
+         ([YN]*); # bidi mirrored
+         ([^;]*); # unicode 1.0 name
+         ([^;]*); # iso comment
+         ([0-9A-F]*); # simple uppercase mapping
+         ([0-9A-F]*); # simple lowercase mapping
+         ([0-9A-F]*)$/ix # simple titlecase mapping
+       codepoint.code = $1.hex
+       #codepoint.name = $2
+       #codepoint.category = $3
+       codepoint.combining_class = Integer($4)
+       #codepoint.bidi_class = $5
+       codepoint.decomp_type = $7
+       codepoint.decomp_mapping = ($8=='') ? nil : $8.split.collect { |element| element.hex }
+       #codepoint.bidi_mirrored = ($13=='Y') ? true : false
+       codepoint.uppercase_mapping = ($16=='') ? 0 : $16.hex
+       codepoint.lowercase_mapping = ($17=='') ? 0 : $17.hex
+       #codepoint.titlecase_mapping = ($18=='') ? nil : $18.hex
+       @ucd.codepoints[codepoint.code] = codepoint
+     end
+
+     def parse_grapheme_break_property(line)
+       if line =~ /^([0-9A-F\.]+)\s*;\s*([\w]+)\s*#/
+         type = $2.downcase.intern
+         @ucd.boundary[type] ||= []
+         if $1.include? '..'
+           parts = $1.split '..'
+           @ucd.boundary[type] << (parts[0].hex..parts[1].hex)
+         else
+           @ucd.boundary[type] << $1.hex
+         end
+       end
+     end
+
+     def parse_composition_exclusion(line)
+       if line =~ /^([0-9A-F]+)/i
+         @ucd.composition_exclusion << $1.hex
+       end
+     end
+
+     def parse_cp1252(line)
+       if line =~ /^([0-9A-Fx]+)\s([0-9A-Fx]+)/i
+         @ucd.cp1252[$1.hex] = $2.hex
+       end
+     end
+
+     def create_composition_map
+       @ucd.codepoints.each do |_, cp|
+         if !cp.nil? and cp.combining_class == 0 and cp.decomp_type.nil? and !cp.decomp_mapping.nil? and cp.decomp_mapping.length == 2 and @ucd[cp.decomp_mapping[0]].combining_class == 0 and !@ucd.composition_exclusion.include?(cp.code)
+           @ucd.composition_map[cp.decomp_mapping[0]] ||= {}
+           @ucd.composition_map[cp.decomp_mapping[0]][cp.decomp_mapping[1]] = cp.code
+         end
+       end
+     end
+
+     def normalize_boundary_map
+       @ucd.boundary.each do |k,v|
+         if [:lf, :cr].include? k
+           @ucd.boundary[k] = v[0]
+         end
+       end
+     end
+
+     def parse
+       SOURCES.each do |type, url|
+         filename = File.join(Dir.tmpdir, "#{url.split('/').last}")
+         unless File.exist?(filename)
+           $stderr.puts "Downloading #{url.split('/').last}"
+           File.open(filename, 'wb') do |target|
+             open(url) do |source|
+               source.each_line { |line| target.write line }
+             end
+           end
+         end
+         File.open(filename) do |file|
+           file.each_line { |line| send "parse_#{type}".intern, line }
+         end
+       end
+       create_composition_map
+       normalize_boundary_map
+     end
+
+     def dump_to(filename)
+       File.open(filename, 'wb') do |f|
+         f.write Marshal.dump([@ucd.codepoints, @ucd.composition_exclusion, @ucd.composition_map, @ucd.boundary, @ucd.cp1252])
+       end
+     end
+   end
+ end
+
+ if __FILE__ == $0
+   filename = Multibyte::Handlers::UnicodeDatabase.filename
+   generator = Multibyte::Handlers::UnicodeTableGenerator.new
+   generator.parse
+   print "Writing to: #{filename}"
+   generator.dump_to filename
+   puts " (#{File.size(filename)} bytes)"
+ end
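
Note on usage: the generator above doubles as a standalone script, and its `if __FILE__ == $0` block shows the whole workflow: download the Unicode data files into Dir.tmpdir (skipping files already present), parse them, and marshal the resulting tables to UnicodeDatabase.filename. A minimal sketch of driving it programmatically, mirroring that block (the require paths follow the gem's lib/ layout; whether `require 'multibyte'` alone defines Multibyte::UNICODE_VERSION and the handler classes is an assumption here):

    require 'multibyte'                            # assumed to define Multibyte::UNICODE_VERSION and the Handlers namespace
    require 'multibyte/generators/generate_tables'

    filename  = Multibyte::Handlers::UnicodeDatabase.filename
    generator = Multibyte::Handlers::UnicodeTableGenerator.new
    generator.parse               # download (if missing) and parse UnicodeData.txt and friends
    generator.dump_to(filename)   # marshal the tables to values/unicode_tables.dat
    puts "#{filename} (#{File.size(filename)} bytes)"
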
data/lib/multibyte/handlers/passthru_handler.rb
@@ -0,0 +1,9 @@
+ # Chars uses this handler when $KCODE is not set to 'UTF8'. Because this handler doesn't define any methods all call
+ # will be forwarded to String.
+ class Multibyte::Handlers::PassthruHandler #:nodoc:
+
+   # Return the original byteoffset
+   def self.translate_offset(string, byte_offset) #:nodoc:
+     byte_offset
+   end
+ end
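
The pass-through contract is deliberately trivial: translate_offset returns the byte offset untouched, and because the class defines nothing else, every other call on a Chars proxy falls through to plain String. A small sketch (assuming the gem's files are on the load path and that requiring 'multibyte' sets up the Handlers namespace):

    require 'multibyte'                                   # assumed to load the Multibyte::Handlers namespace
    require 'multibyte/handlers/passthru_handler'

    Multibyte::Handlers::PassthruHandler.translate_offset('resume', 3)   # returns 3, the byte offset unchanged
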
data/lib/multibyte/handlers/utf8_handler.rb
@@ -0,0 +1,564 @@
+ # Contains all the handlers and helper classes
+ module Multibyte::Handlers #:nodoc:
+   class EncodingError < ArgumentError #:nodoc:
+   end
+
+   class Codepoint #:nodoc:
+     attr_accessor :code, :combining_class, :decomp_type, :decomp_mapping, :uppercase_mapping, :lowercase_mapping
+   end
+
+   class UnicodeDatabase #:nodoc:
+     attr_writer :codepoints, :composition_exclusion, :composition_map, :boundary, :cp1252
+
+     # self-expiring methods that lazily load the Unicode database and then return the value.
+     [:codepoints, :composition_exclusion, :composition_map, :boundary, :cp1252].each do |attr_name|
+       class_eval(<<-EOS, __FILE__, __LINE__)
+         def #{attr_name}
+           load
+           @#{attr_name}
+         end
+       EOS
+     end
+
+     # Shortcut to ucd.codepoints[]
+     def [](index); codepoints[index]; end
+
+     # Returns the directory in which the data files are stored
+     def self.dirname
+       File.dirname(__FILE__) + '/../../values/'
+     end
+
+     # Returns the filename for the data file for this version
+     def self.filename
+       File.expand_path File.join(dirname, "unicode_tables.dat")
+     end
+
+     # Loads the unicode database and returns all the internal objects of UnicodeDatabase
+     # Once the values have been loaded, define attr_reader methods for the instance variables.
+     def load
+       begin
+         @codepoints, @composition_exclusion, @composition_map, @boundary, @cp1252 = File.open(self.class.filename, 'rb') { |f| Marshal.load f.read }
+       rescue Exception => e
+         raise IOError.new("Couldn't load the unicode tables for UTF8Handler (#{e.message}), handler is unusable")
+       end
+       @codepoints ||= Hash.new(Codepoint.new)
+       @composition_exclusion ||= []
+       @composition_map ||= {}
+       @boundary ||= {}
+       @cp1252 ||= {}
+
+       # Redefine the === method so we can write shorter rules for grapheme cluster breaks
+       @boundary.each do |k,_|
+         @boundary[k].instance_eval do
+           def ===(other)
+             detect { |i| i === other } ? true : false
+           end
+         end if @boundary[k].kind_of?(Array)
+       end
+
+       # define attr_reader methods for the instance variables
+       class << self
+         attr_reader :codepoints, :composition_exclusion, :composition_map, :boundary, :cp1252
+       end
+     end
+   end
+
+   # UTF8Handler implements Unicode aware operations for strings, these operations will be used by the Chars
+   # proxy when $KCODE is set to 'UTF8'.
+   class UTF8Handler
+     # Hangul character boundaries and properties
+     HANGUL_SBASE = 0xAC00
+     HANGUL_LBASE = 0x1100
+     HANGUL_VBASE = 0x1161
+     HANGUL_TBASE = 0x11A7
+     HANGUL_LCOUNT = 19
+     HANGUL_VCOUNT = 21
+     HANGUL_TCOUNT = 28
+     HANGUL_NCOUNT = HANGUL_VCOUNT * HANGUL_TCOUNT
+     HANGUL_SCOUNT = 11172
+     HANGUL_SLAST = HANGUL_SBASE + HANGUL_SCOUNT
+     HANGUL_JAMO_FIRST = 0x1100
+     HANGUL_JAMO_LAST = 0x11FF
+
+     # All the unicode whitespace
+     UNICODE_WHITESPACE = [
+       (0x0009..0x000D).to_a, # White_Space # Cc [5] <control-0009>..<control-000D>
+       0x0020, # White_Space # Zs SPACE
+       0x0085, # White_Space # Cc <control-0085>
+       0x00A0, # White_Space # Zs NO-BREAK SPACE
+       0x1680, # White_Space # Zs OGHAM SPACE MARK
+       0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR
+       (0x2000..0x200A).to_a, # White_Space # Zs [11] EN QUAD..HAIR SPACE
+       0x2028, # White_Space # Zl LINE SEPARATOR
+       0x2029, # White_Space # Zp PARAGRAPH SEPARATOR
+       0x202F, # White_Space # Zs NARROW NO-BREAK SPACE
+       0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE
+       0x3000, # White_Space # Zs IDEOGRAPHIC SPACE
+     ].flatten.freeze
+
+     # BOM (byte order mark) can also be seen as whitespace, it's a non-rendering character used to distinguish
+     # between little and big endian. This is not an issue in utf-8, so it must be ignored.
+     UNICODE_LEADERS_AND_TRAILERS = UNICODE_WHITESPACE + [65279] # ZERO-WIDTH NO-BREAK SPACE aka BOM
+
+     # Borrowed from the Kconv library by Shinji KONO - (also as seen on the W3C site)
+     UTF8_PAT = /\A(?:
+       [\x00-\x7f] |
+       [\xc2-\xdf] [\x80-\xbf] |
+       \xe0 [\xa0-\xbf] [\x80-\xbf] |
+       [\xe1-\xef] [\x80-\xbf] [\x80-\xbf] |
+       \xf0 [\x90-\xbf] [\x80-\xbf] [\x80-\xbf] |
+       [\xf1-\xf3] [\x80-\xbf] [\x80-\xbf] [\x80-\xbf] |
+       \xf4 [\x80-\x8f] [\x80-\xbf] [\x80-\xbf]
+     )*\z/xn
+
+     # Returns a regular expression pattern that matches the passed Unicode codepoints
+     def self.codepoints_to_pattern(array_of_codepoints) #:nodoc:
+       array_of_codepoints.collect{ |e| [e].pack 'U*' }.join('|')
+     end
+     UNICODE_TRAILERS_PAT = /(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+\Z/
+     UNICODE_LEADERS_PAT = /\A(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+/
+
+     class << self
+
+       # ///
+       # /// BEGIN String method overrides
+       # ///
+
+       # Inserts the passed string at specified codepoint offsets
+       def insert(str, offset, fragment)
+         str.replace(
+           u_unpack(str).insert(
+             offset,
+             u_unpack(fragment)
+           ).flatten.pack('U*')
+         )
+       end
+
+       # Returns the position of the passed argument in the string, counting in codepoints
+       def index(str, *args)
+         bidx = str.index(*args)
+         bidx ? (u_unpack(str.slice(0...bidx)).size) : nil
+       end
+
+       # Works just like the indexed replace method on string, except instead of byte offsets you specify
+       # character offsets.
+       #
+       # Example:
+       #
+       #   s = "Müller"
+       #   s.chars[2] = "e" # Replace character with offset 2
+       #   s # => "Müeler"
+       #
+       #   s = "Müller"
+       #   s.chars[1, 2] = "ö" # Replace 2 characters at character offset 1
+       #   s # => "Möler"
+       def []=(str, *args)
+         replace_by = args.pop
+         # Indexed replace with regular expressions already works
+         return str[*args] = replace_by if args.first.is_a?(Regexp)
+         result = u_unpack(str)
+         if args[0].is_a?(Fixnum)
+           raise IndexError, "index #{args[0]} out of string" if args[0] >= result.length
+           min = args[0]
+           max = args[1].nil? ? min : (min + args[1] - 1)
+           range = Range.new(min, max)
+           replace_by = [replace_by].pack('U') if replace_by.is_a?(Fixnum)
+         elsif args.first.is_a?(Range)
+           raise RangeError, "#{args[0]} out of range" if args[0].min >= result.length
+           range = args[0]
+         else
+           needle = args[0].to_s
+           min = index(str, needle)
+           max = min + length(needle) - 1
+           range = Range.new(min, max)
+         end
+         result[range] = u_unpack(replace_by)
+         str.replace(result.pack('U*'))
+       end
+
+       # Works just like String#rjust, only integer specifies characters instead of bytes.
+       #
+       # Example:
+       #
+       #   "¾ cup".chars.rjust(8).to_s
+       #   # => " ¾ cup"
+       #
+       #   "¾ cup".chars.rjust(8, " ").to_s # Use non-breaking whitespace
+       #   # => "   ¾ cup"
+       def rjust(str, integer, padstr=' ')
+         justify(str, integer, :right, padstr)
+       end
+
+       # Works just like String#ljust, only integer specifies characters instead of bytes.
+       #
+       # Example:
+       #
+       #   "¾ cup".chars.rjust(8).to_s
+       #   # => "¾ cup "
+       #
+       #   "¾ cup".chars.rjust(8, " ").to_s # Use non-breaking whitespace
+       #   # => "¾ cup   "
+       def ljust(str, integer, padstr=' ')
+         justify(str, integer, :left, padstr)
+       end
+
+       # Works just like String#center, only integer specifies characters instead of bytes.
+       #
+       # Example:
+       #
+       #   "¾ cup".chars.center(8).to_s
+       #   # => " ¾ cup "
+       #
+       #   "¾ cup".chars.center(8, " ").to_s # Use non-breaking whitespace
+       #   # => " ¾ cup  "
+       def center(str, integer, padstr=' ')
+         justify(str, integer, :center, padstr)
+       end
+
+       # Does Unicode-aware rstrip
+       def rstrip(str)
+         str.gsub(UNICODE_TRAILERS_PAT, '')
+       end
+
+       # Does Unicode-aware lstrip
+       def lstrip(str)
+         str.gsub(UNICODE_LEADERS_PAT, '')
+       end
+
+       # Removed leading and trailing whitespace
+       def strip(str)
+         str.gsub(UNICODE_LEADERS_PAT, '').gsub(UNICODE_TRAILERS_PAT, '')
+       end
+
+       # Returns the number of codepoints in the string
+       def size(str)
+         u_unpack(str).size
+       end
+       alias_method :length, :size
+
+       # Reverses codepoints in the string.
+       def reverse(str)
+         u_unpack(str).reverse.pack('U*')
+       end
+
+       # Implements Unicode-aware slice with codepoints. Slicing on one point returns the codepoints for that
+       # character.
+       def slice(str, *args)
+         if args.size > 2
+           raise ArgumentError, "wrong number of arguments (#{args.size} for 1)" # Do as if we were native
+         elsif (args.size == 2 && !(args.first.is_a?(Numeric) || args.first.is_a?(Regexp)))
+           raise TypeError, "cannot convert #{args.first.class} into Integer" # Do as if we were native
+         elsif (args.size == 2 && !args[1].is_a?(Numeric))
+           raise TypeError, "cannot convert #{args[1].class} into Integer" # Do as if we were native
+         elsif args[0].kind_of? Range
+           cps = u_unpack(str).slice(*args)
+           cps.nil? ? nil : cps.pack('U*')
+         elsif args[0].kind_of? Regexp
+           str.slice(*args)
+         elsif args.size == 1 && args[0].kind_of?(Numeric)
+           u_unpack(str)[args[0]]
+         else
+           u_unpack(str).slice(*args).pack('U*')
+         end
+       end
+       alias_method :[], :slice
+
+       # Convert characters in the string to uppercase
+       def upcase(str); to_case :uppercase_mapping, str; end
+
+       # Convert characters in the string to lowercase
+       def downcase(str); to_case :lowercase_mapping, str; end
+
+       # Returns a copy of +str+ with the first character converted to uppercase and the remainder to lowercase
+       def capitalize(str)
+         upcase(slice(str, 0..0)) + downcase(slice(str, 1..-1) || '')
+       end
+
+       # ///
+       # /// Extra String methods for unicode operations
+       # ///
+
+       # Returns the KC normalization of the string by default. NFKC is considered the best normalization form for
+       # passing strings to databases and validations.
+       #
+       # * <tt>str</tt> - The string to perform normalization on.
+       # * <tt>form</tt> - The form you want to normalize in. Should be one of the following:
+       #   <tt>:c</tt>, <tt>:kc</tt>, <tt>:d</tt>, or <tt>:kd</tt>. Default is
+       #   ActiveSupport::Multibyte::DEFAULT_NORMALIZATION_FORM.
+       def normalize(str, form=ActiveSupport::Multibyte::DEFAULT_NORMALIZATION_FORM)
+         # See http://www.unicode.org/reports/tr15, Table 1
+         codepoints = u_unpack(str)
+         case form
+           when :d
+             reorder_characters(decompose_codepoints(:canonical, codepoints))
+           when :c
+             compose_codepoints reorder_characters(decompose_codepoints(:canonical, codepoints))
+           when :kd
+             reorder_characters(decompose_codepoints(:compatability, codepoints))
+           when :kc
+             compose_codepoints reorder_characters(decompose_codepoints(:compatability, codepoints))
+           else
+             raise ArgumentError, "#{form} is not a valid normalization variant", caller
+         end.pack('U*')
+       end
+
+       # Perform decomposition on the characters in the string
+       def decompose(str)
+         decompose_codepoints(:canonical, u_unpack(str)).pack('U*')
+       end
+
+       # Perform composition on the characters in the string
+       def compose(str)
+         compose_codepoints u_unpack(str).pack('U*')
+       end
+
+       # ///
+       # /// BEGIN Helper methods for unicode operation
+       # ///
+
+       # Used to translate an offset from bytes to characters, for instance one received from a regular expression match
+       def translate_offset(str, byte_offset)
+         return nil if byte_offset.nil?
+         return 0 if str == ''
+         chunk = str[0..byte_offset]
+         begin
+           begin
+             chunk.unpack('U*').length - 1
+           rescue ArgumentError => e
+             chunk = str[0..(byte_offset+=1)]
+             # Stop retrying at the end of the string
+             raise e unless byte_offset < chunk.length
+             # We damaged a character, retry
+             retry
+           end
+         # Catch the ArgumentError so we can throw our own
+         rescue ArgumentError
+           raise EncodingError.new('malformed UTF-8 character')
+         end
+       end
+
+       # Checks if the string is valid UTF8.
+       def consumes?(str)
+         # Unpack is a little bit faster than regular expressions
+         begin
+           str.unpack('U*')
+           true
+         rescue ArgumentError
+           false
+         end
+       end
+
+       # Returns the number of grapheme clusters in the string. This method is very likely to be moved or renamed
+       # in future versions.
+       def g_length(str)
+         g_unpack(str).length
+       end
+
+       # Replaces all the non-utf-8 bytes by their iso-8859-1 or cp1252 equivalent resulting in a valid utf-8 string
+       def tidy_bytes(str)
+         str.split(//u).map do |c|
+           if !UTF8_PAT.match(c)
+             n = c.unpack('C')[0]
+             n < 128 ? n.chr :
+             n < 160 ? [UCD.cp1252[n] || n].pack('U') :
+             n < 192 ? "\xC2" + n.chr : "\xC3" + (n-64).chr
+           else
+             c
+           end
+         end.join
+       end
+
+       protected
+
+       # Detect whether the codepoint is in a certain character class. Primarily used by the
+       # grapheme cluster support.
+       def in_char_class?(codepoint, classes)
+         classes.detect { |c| UCD.boundary[c] === codepoint } ? true : false
+       end
+
+       # Unpack the string at codepoints boundaries
+       def u_unpack(str)
+         begin
+           str.unpack 'U*'
+         rescue ArgumentError
+           raise EncodingError.new('malformed UTF-8 character')
+         end
+       end
+
+       # Unpack the string at grapheme boundaries instead of codepoint boundaries
+       def g_unpack(str)
+         codepoints = u_unpack(str)
+         unpacked = []
+         pos = 0
+         marker = 0
+         eoc = codepoints.length
+         while(pos < eoc)
+           pos += 1
+           previous = codepoints[pos-1]
+           current = codepoints[pos]
+           if (
+               # CR X LF
+               one = ( previous == UCD.boundary[:cr] and current == UCD.boundary[:lf] ) or
+               # L X (L|V|LV|LVT)
+               two = ( UCD.boundary[:l] === previous and in_char_class?(current, [:l,:v,:lv,:lvt]) ) or
+               # (LV|V) X (V|T)
+               three = ( in_char_class?(previous, [:lv,:v]) and in_char_class?(current, [:v,:t]) ) or
+               # (LVT|T) X (T)
+               four = ( in_char_class?(previous, [:lvt,:t]) and UCD.boundary[:t] === current ) or
+               # X Extend
+               five = (UCD.boundary[:extend] === current)
+             )
+           else
+             unpacked << codepoints[marker..pos-1]
+             marker = pos
+           end
+         end
+         unpacked
+       end
+
+       # Reverse operation of g_unpack
+       def g_pack(unpacked)
+         unpacked.flatten
+       end
+
+       # Justifies a string in a certain way. Valid values for <tt>way</tt> are <tt>:right</tt>, <tt>:left</tt> and
+       # <tt>:center</tt>. Is primarily used as a helper method by <tt>rjust</tt>, <tt>ljust</tt> and <tt>center</tt>.
+       def justify(str, integer, way, padstr=' ')
+         raise ArgumentError, "zero width padding" if padstr.length == 0
+         padsize = integer - size(str)
+         padsize = padsize > 0 ? padsize : 0
+         case way
+           when :right
+             str.dup.insert(0, padding(padsize, padstr))
+           when :left
+             str.dup.insert(-1, padding(padsize, padstr))
+           when :center
+             lpad = padding((padsize / 2.0).floor, padstr)
+             rpad = padding((padsize / 2.0).ceil, padstr)
+             str.dup.insert(0, lpad).insert(-1, rpad)
+         end
+       end
+
+       # Generates a padding string of a certain size.
+       def padding(padsize, padstr=' ')
+         if padsize != 0
+           slice(padstr * ((padsize / size(padstr)) + 1), 0, padsize)
+         else
+           ''
+         end
+       end
+
+       # Convert characters to a different case
+       def to_case(way, str)
+         u_unpack(str).map do |codepoint|
+           cp = UCD[codepoint]
+           unless cp.nil?
+             ncp = cp.send(way)
+             ncp > 0 ? ncp : codepoint
+           else
+             codepoint
+           end
+         end.pack('U*')
+       end
+
+       # Re-order codepoints so the string becomes canonical
+       def reorder_characters(codepoints)
+         length = codepoints.length- 1
+         pos = 0
+         while pos < length do
+           cp1, cp2 = UCD[codepoints[pos]], UCD[codepoints[pos+1]]
+           if (cp1.combining_class > cp2.combining_class) && (cp2.combining_class > 0)
+             codepoints[pos..pos+1] = cp2.code, cp1.code
+             pos += (pos > 0 ? -1 : 1)
+           else
+             pos += 1
+           end
+         end
+         codepoints
+       end
+
+       # Decompose composed characters to the decomposed form
+       def decompose_codepoints(type, codepoints)
+         codepoints.inject([]) do |decomposed, cp|
+           # if it's a hangul syllable starter character
+           if HANGUL_SBASE <= cp and cp < HANGUL_SLAST
+             sindex = cp - HANGUL_SBASE
+             ncp = [] # new codepoints
+             ncp << HANGUL_LBASE + sindex / HANGUL_NCOUNT
+             ncp << HANGUL_VBASE + (sindex % HANGUL_NCOUNT) / HANGUL_TCOUNT
+             tindex = sindex % HANGUL_TCOUNT
+             ncp << (HANGUL_TBASE + tindex) unless tindex == 0
+             decomposed.concat ncp
+           # if the codepoint is decomposable in with the current decomposition type
+           elsif (ncp = UCD[cp].decomp_mapping) and (!UCD[cp].decomp_type || type == :compatability)
+             decomposed.concat decompose_codepoints(type, ncp.dup)
+           else
+             decomposed << cp
+           end
+         end
+       end
+
+       # Compose decomposed characters to the composed form
+       def compose_codepoints(codepoints)
+         pos = 0
+         eoa = codepoints.length - 1
+         starter_pos = 0
+         starter_char = codepoints[0]
+         previous_combining_class = -1
+         while pos < eoa
+           pos += 1
+           lindex = starter_char - HANGUL_LBASE
+           # -- Hangul
+           if 0 <= lindex and lindex < HANGUL_LCOUNT
+             vindex = codepoints[starter_pos+1] - HANGUL_VBASE rescue vindex = -1
+             if 0 <= vindex and vindex < HANGUL_VCOUNT
+               tindex = codepoints[starter_pos+2] - HANGUL_TBASE rescue tindex = -1
+               if 0 <= tindex and tindex < HANGUL_TCOUNT
+                 j = starter_pos + 2
+                 eoa -= 2
+               else
+                 tindex = 0
+                 j = starter_pos + 1
+                 eoa -= 1
+               end
+               codepoints[starter_pos..j] = (lindex * HANGUL_VCOUNT + vindex) * HANGUL_TCOUNT + tindex + HANGUL_SBASE
+             end
+             starter_pos += 1
+             starter_char = codepoints[starter_pos]
+           # -- Other characters
+           else
+             current_char = codepoints[pos]
+             current = UCD[current_char]
+             if current.combining_class > previous_combining_class
+               if ref = UCD.composition_map[starter_char]
+                 composition = ref[current_char]
+               else
+                 composition = nil
+               end
+               unless composition.nil?
+                 codepoints[starter_pos] = composition
+                 starter_char = composition
+                 codepoints.delete_at pos
+                 eoa -= 1
+                 pos -= 1
+                 previous_combining_class = -1
+               else
+                 previous_combining_class = current.combining_class
+               end
+             else
+               previous_combining_class = current.combining_class
+             end
+             if current.combining_class == 0
+               starter_pos = pos
+               starter_char = codepoints[pos]
+             end
+           end
+         end
+         codepoints
+       end
+
+       # UniCode Database
+       UCD = UnicodeDatabase.new
+     end
+   end
+ end
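
Taken together, UTF8Handler gives the Chars proxy codepoint-aware versions of the usual String operations plus the normalization and offset-translation helpers. A rough usage sketch of calling the handler's class methods directly (Ruby 1.8 era; it assumes $KCODE is set to 'u', that `require 'multibyte'` loads the handler, and that values/unicode_tables.dat is present for the UCD-backed methods):

    $KCODE = 'u'
    require 'multibyte'                       # assumed to load Multibyte::Handlers::UTF8Handler

    handler = Multibyte::Handlers::UTF8Handler

    handler.size('Müller')                    # counts 6 codepoints rather than 7 bytes
    handler.reverse('Müller')                 # reverses by codepoint, keeping the 'ü' intact
    handler.translate_offset('Müller', 2)     # byte offset 2 lands inside 'ü', so character offset 1
    handler.normalize('Müller', :kd)          # compatibility decomposition: 'u' followed by a combining diaeresis
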