multibyte 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100)
  1. data/.git/COMMIT_EDITMSG +57 -0
  2. data/.git/HEAD +1 -0
  3. data/.git/config +8 -0
  4. data/.git/description +1 -0
  5. data/.git/hooks/applypatch-msg +15 -0
  6. data/.git/hooks/commit-msg +21 -0
  7. data/.git/hooks/post-commit +8 -0
  8. data/.git/hooks/post-receive +16 -0
  9. data/.git/hooks/post-update +8 -0
  10. data/.git/hooks/pre-applypatch +14 -0
  11. data/.git/hooks/pre-commit +70 -0
  12. data/.git/hooks/pre-rebase +150 -0
  13. data/.git/hooks/update +107 -0
  14. data/.git/index +0 -0
  15. data/.git/info/exclude +6 -0
  16. data/.git/logs/HEAD +4 -0
  17. data/.git/logs/refs/heads/master +4 -0
  18. data/.git/logs/refs/remotes/origin/master +3 -0
  19. data/.git/objects/02/e9a792fa31b00f0f06012e0b5e3ef327c40f64 +0 -0
  20. data/.git/objects/10/0b93820ade4c16225673b4ca62bb3ade63c313 +0 -0
  21. data/.git/objects/10/aab6490174970a2349d1050882c644f8d07006 +7 -0
  22. data/.git/objects/11/cda2eaac1ae103a13bf835456de05f53b0c8ad +0 -0
  23. data/.git/objects/1d/61c71cd5cf0f4dae6305b007a6177e1cecd7ea +0 -0
  24. data/.git/objects/2c/84cd00c156ba3b46e733c7e34dd9aea0c4fa75 +0 -0
  25. data/.git/objects/2f/3a16e4a756ec7a2d32c4bd8386b855f41cae61 +0 -0
  26. data/.git/objects/2f/437425b2c085bb529379e409d6b4bc7f9f9fba +0 -0
  27. data/.git/objects/3f/9afe5eaf00ad4d2ffa2e9315b96f530b0e85d4 +2 -0
  28. data/.git/objects/42/4a5f37c6fe3a7cac54b0f85688c1cce7da9cdf +0 -0
  29. data/.git/objects/47/26f03dc598a372cb959e3cbe33334de70802fd +0 -0
  30. data/.git/objects/55/37e6b2a449fe6d6c800803d0f72ffbcf44c297 +0 -0
  31. data/.git/objects/64/334b80d5fcead16f13cb8b7a45120ea460747b +0 -0
  32. data/.git/objects/69/1ed3b65603a0d0dabfb66720e7163664be78f5 +0 -0
  33. data/.git/objects/72/141584a5836168fe912ebce5b4537bd0eafc09 +1 -0
  34. data/.git/objects/7c/e872bc55ff766e542d2fbbe00e9dde70baaa92 +2 -0
  35. data/.git/objects/93/e03faa6906892434fa9f2ad230c4ececc64055 +0 -0
  36. data/.git/objects/9d/c9a787700bafb70e1833a760b73dec26a6a56e +0 -0
  37. data/.git/objects/9f/672edccfaaa00b6584f599c3e68560e6a5bedf +0 -0
  38. data/.git/objects/ab/1cd78b02ec268ce7d2faf4b1692f4bed9273fe +0 -0
  39. data/.git/objects/ad/8866bd33ae1afe9278cf5ee833d71977af561c +0 -0
  40. data/.git/objects/af/c3ea327fa220ee42a5ebecfcc7d218be4ea1d9 +0 -0
  41. data/.git/objects/b2/56d1cf3a9ba99f308daf1b4cd0dd08f11448e0 +1 -0
  42. data/.git/objects/bc/184b885eca1bed875171bd3990433055dda3af +0 -0
  43. data/.git/objects/c2/7f6559350f7adb19d43742b55b2f91d07b6550 +0 -0
  44. data/.git/objects/c3/e7364c6e90fb80e979779cd822bc8cdc8a3790 +0 -0
  45. data/.git/objects/c9/5128ff8f94e949d796afa128f2f341ba69989c +0 -0
  46. data/.git/objects/ce/e1f5b453e86a5aa913c626641608839bb79eef +1 -0
  47. data/.git/objects/cf/6add7ea568d3d90d6a1f8afb0898b0119b14ff +0 -0
  48. data/.git/objects/d5/e18b1fd7b189fdc75dd3bc4d208323cbc01aa8 +0 -0
  49. data/.git/objects/de/6e40dc4d92c1a889f8581718a48f2fcd219442 +0 -0
  50. data/.git/objects/df/42782d610e8c714f78cf0a018c600667da795e +0 -0
  51. data/.git/objects/e0/65ff01f0883142d4901a371be8dc60998fbe04 +4 -0
  52. data/.git/objects/e1/ad06969c3d3982b06ebd7da3777a0926501e7c +0 -0
  53. data/.git/objects/e4/69154c2ae4c3c596870a5e920f197b7b5be179 +2 -0
  54. data/.git/objects/e4/8464df56bf487e96e21ea99487330266dae3c9 +0 -0
  55. data/.git/objects/e6/9de29bb2d1d6434b8b29ae775ad8c2e48c5391 +0 -0
  56. data/.git/objects/e8/517350dbaadcb01282282218638c6022a20091 +0 -0
  57. data/.git/objects/eb/1bffe6adda4dc202d219cb35fe142a5e42b46b +0 -0
  58. data/.git/objects/eb/5d1c06d3a469df91dff68022b3b179e30cd127 +0 -0
  59. data/.git/objects/ec/f2f7e20cd14b0d4b5c0b7d0450aafc8c4f7066 +0 -0
  60. data/.git/objects/f1/ca0626d3e5dfcf88df23c9720d6f770fcbef73 +0 -0
  61. data/.git/objects/f2/3c73e1110a2fec82d37a1a399b9f274d30f0fe +0 -0
  62. data/.git/objects/f2/aec2820bf2f1c87c031d4abe262560392ac4b9 +0 -0
  63. data/.git/objects/f8/73dae66e0208669dbf74718181b3866647f426 +0 -0
  64. data/.git/refs/heads/master +1 -0
  65. data/.git/refs/remotes/origin/master +1 -0
  66. data/History.txt +4 -0
  67. data/License.txt +20 -0
  68. data/Manifest.txt +97 -0
  69. data/README.txt +19 -0
  70. data/Rakefile +4 -0
  71. data/config/hoe.rb +70 -0
  72. data/config/requirements.rb +17 -0
  73. data/lib/multibyte/chars.rb +135 -0
  74. data/lib/multibyte/generators/generate_tables.rb +145 -0
  75. data/lib/multibyte/handlers/passthru_handler.rb +9 -0
  76. data/lib/multibyte/handlers/utf8_handler.rb +564 -0
  77. data/lib/multibyte/handlers/utf8_handler_proc.rb +43 -0
  78. data/lib/multibyte/version.rb +9 -0
  79. data/lib/multibyte.rb +15 -0
  80. data/lib/unicode.rb +66 -0
  81. data/log/debug.log +0 -0
  82. data/script/destroy +14 -0
  83. data/script/generate +14 -0
  84. data/script/txt2html +74 -0
  85. data/setup.rb +1585 -0
  86. data/spec/multibyte_spec.rb +11 -0
  87. data/spec/spec.opts +1 -0
  88. data/spec/spec_helper.rb +7 -0
  89. data/tasks/deployment.rake +34 -0
  90. data/tasks/environment.rake +7 -0
  91. data/tasks/rspec.rake +21 -0
  92. data/tasks/website.rake +17 -0
  93. data/website/index.html +11 -0
  94. data/website/index.txt +39 -0
  95. data/website/javascripts/rounded_corners_lite.inc.js +285 -0
  96. data/website/stylesheets/screen.css +138 -0
  97. data/website/template.rhtml +48 -0
  98. data.tar.gz.sig +3 -0
  99. metadata +176 -0
  100. metadata.gz.sig +0 -0
data/lib/multibyte/generators/generate_tables.rb
@@ -0,0 +1,145 @@
+ #!/usr/bin/env ruby
+ require 'open-uri'
+ require 'tmpdir'
+
+ module Multibyte::Handlers #:nodoc:
+   class UnicodeDatabase #:nodoc:
+     def self.load
+       [Hash.new(Codepoint.new),[],{},{}]
+     end
+   end
+
+   class UnicodeTableGenerator #:nodoc:
+     BASE_URI = "http://www.unicode.org/Public/#{Multibyte::UNICODE_VERSION}/ucd/"
+     SOURCES = {
+       :codepoints => BASE_URI + 'UnicodeData.txt',
+       :composition_exclusion => BASE_URI + 'CompositionExclusions.txt',
+       :grapheme_break_property => BASE_URI + 'auxiliary/GraphemeBreakProperty.txt',
+       :cp1252 => 'http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT'
+     }
+
+     def initialize
+       @ucd = UnicodeDatabase.new
+
+       default = Codepoint.new
+       default.combining_class = 0
+       default.uppercase_mapping = 0
+       default.lowercase_mapping = 0
+       @ucd.codepoints = Hash.new(default)
+
+       @ucd.composition_exclusion = []
+       @ucd.composition_map = {}
+       @ucd.boundary = {}
+       @ucd.cp1252 = {}
+     end
+
+     def parse_codepoints(line)
+       codepoint = Codepoint.new
+       raise "Could not parse input." unless line =~ /^
+         ([0-9A-F]+);        # code
+         ([^;]+);            # name
+         ([A-Z]+);           # general category
+         ([0-9]+);           # canonical combining class
+         ([A-Z]+);           # bidi class
+         (<([A-Z]*)>)?       # decomposition type
+         ((\ ?[0-9A-F]+)*);  # decomposition mapping
+         ([0-9]*);           # decimal digit
+         ([0-9]*);           # digit
+         ([^;]*);            # numeric
+         ([YN]*);            # bidi mirrored
+         ([^;]*);            # unicode 1.0 name
+         ([^;]*);            # iso comment
+         ([0-9A-F]*);        # simple uppercase mapping
+         ([0-9A-F]*);        # simple lowercase mapping
+         ([0-9A-F]*)$/ix     # simple titlecase mapping
+       codepoint.code = $1.hex
+       #codepoint.name = $2
+       #codepoint.category = $3
+       codepoint.combining_class = Integer($4)
+       #codepoint.bidi_class = $5
+       codepoint.decomp_type = $7
+       codepoint.decomp_mapping = ($8=='') ? nil : $8.split.collect { |element| element.hex }
+       #codepoint.bidi_mirrored = ($13=='Y') ? true : false
+       codepoint.uppercase_mapping = ($16=='') ? 0 : $16.hex
+       codepoint.lowercase_mapping = ($17=='') ? 0 : $17.hex
+       #codepoint.titlecase_mapping = ($18=='') ? nil : $18.hex
+       @ucd.codepoints[codepoint.code] = codepoint
+     end
+
+     def parse_grapheme_break_property(line)
+       if line =~ /^([0-9A-F\.]+)\s*;\s*([\w]+)\s*#/
+         type = $2.downcase.intern
+         @ucd.boundary[type] ||= []
+         if $1.include? '..'
+           parts = $1.split '..'
+           @ucd.boundary[type] << (parts[0].hex..parts[1].hex)
+         else
+           @ucd.boundary[type] << $1.hex
+         end
+       end
+     end
+
+     def parse_composition_exclusion(line)
+       if line =~ /^([0-9A-F]+)/i
+         @ucd.composition_exclusion << $1.hex
+       end
+     end
+
+     def parse_cp1252(line)
+       if line =~ /^([0-9A-Fx]+)\s([0-9A-Fx]+)/i
+         @ucd.cp1252[$1.hex] = $2.hex
+       end
+     end
+
+     def create_composition_map
+       @ucd.codepoints.each do |_, cp|
+         if !cp.nil? and cp.combining_class == 0 and cp.decomp_type.nil? and !cp.decomp_mapping.nil? and cp.decomp_mapping.length == 2 and @ucd[cp.decomp_mapping[0]].combining_class == 0 and !@ucd.composition_exclusion.include?(cp.code)
+           @ucd.composition_map[cp.decomp_mapping[0]] ||= {}
+           @ucd.composition_map[cp.decomp_mapping[0]][cp.decomp_mapping[1]] = cp.code
+         end
+       end
+     end
+
+     def normalize_boundary_map
+       @ucd.boundary.each do |k,v|
+         if [:lf, :cr].include? k
+           @ucd.boundary[k] = v[0]
+         end
+       end
+     end
+
+     def parse
+       SOURCES.each do |type, url|
+         filename = File.join(Dir.tmpdir, "#{url.split('/').last}")
+         unless File.exist?(filename)
+           $stderr.puts "Downloading #{url.split('/').last}"
+           File.open(filename, 'wb') do |target|
+             open(url) do |source|
+               source.each_line { |line| target.write line }
+             end
+           end
+         end
+         File.open(filename) do |file|
+           file.each_line { |line| send "parse_#{type}".intern, line }
+         end
+       end
+       create_composition_map
+       normalize_boundary_map
+     end
+
+     def dump_to(filename)
+       File.open(filename, 'wb') do |f|
+         f.write Marshal.dump([@ucd.codepoints, @ucd.composition_exclusion, @ucd.composition_map, @ucd.boundary, @ucd.cp1252])
+       end
+     end
+   end
+ end
+
+ if __FILE__ == $0
+   filename = Multibyte::Handlers::UnicodeDatabase.filename
+   generator = Multibyte::Handlers::UnicodeTableGenerator.new
+   generator.parse
+   print "Writing to: #{filename}"
+   generator.dump_to filename
+   puts " (#{File.size(filename)} bytes)"
+ end
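
The parse_codepoints method above consumes one semicolon-delimited record of UnicodeData.txt per call; the regular expression's trailing comments name each field. A rough sketch of such a record and of running the generator follows; the sample record and the invocation are illustrative assumptions, not content of this package:

    # One UnicodeData.txt record (code; name; category; combining class; bidi class;
    # decomposition; decimal; digit; numeric; mirrored; 1.0 name; ISO comment; upper; lower; title):
    #
    #   00E9;LATIN SMALL LETTER E WITH ACUTE;Ll;0;L;0065 0301;;;;N;;;00C9;;00C9
    #
    # Feeding the four SOURCES files through the generator builds the codepoint, composition and
    # boundary tables and marshals them to the path returned by UnicodeDatabase.filename
    # (a values/unicode_tables.dat file). The script references Multibyte::UNICODE_VERSION and
    # Codepoint, so the library must already be loaded, e.g. something like:
    #
    #   ruby -Ilib -rmultibyte lib/multibyte/generators/generate_tables.rb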
data/lib/multibyte/handlers/passthru_handler.rb
@@ -0,0 +1,9 @@
+ # Chars uses this handler when $KCODE is not set to 'UTF8'. Because this handler doesn't define any methods, all calls
+ # will be forwarded to String.
+ class Multibyte::Handlers::PassthruHandler #:nodoc:
+
+   # Return the original byte offset
+   def self.translate_offset(string, byte_offset) #:nodoc:
+     byte_offset
+   end
+ end
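
PassthruHandler implements only the one piece of the handler interface that Chars cannot forward blindly to String: offset translation. It hands the byte offset back unchanged, since without $KCODE set to UTF-8 the rest of the string methods work bytewise anyway, whereas the UTF8Handler in the next file has to count codepoints. A minimal sketch of the difference, assuming the library is loaded and using a string whose first character is multibyte:

    # "über" encoded as UTF-8 is 5 bytes: "ü" occupies bytes 0-1, "b" starts at byte 2.
    Multibyte::Handlers::PassthruHandler.translate_offset("über", 2) # => 2 (byte offset passed through)
    Multibyte::Handlers::UTF8Handler.translate_offset("über", 2)     # => 1 (character offset of "b")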
data/lib/multibyte/handlers/utf8_handler.rb
@@ -0,0 +1,564 @@
+ # Contains all the handlers and helper classes
+ module Multibyte::Handlers #:nodoc:
+   class EncodingError < ArgumentError #:nodoc:
+   end
+
+   class Codepoint #:nodoc:
+     attr_accessor :code, :combining_class, :decomp_type, :decomp_mapping, :uppercase_mapping, :lowercase_mapping
+   end
+
+   class UnicodeDatabase #:nodoc:
+     attr_writer :codepoints, :composition_exclusion, :composition_map, :boundary, :cp1252
+
+     # self-expiring methods that lazily load the Unicode database and then return the value.
+     [:codepoints, :composition_exclusion, :composition_map, :boundary, :cp1252].each do |attr_name|
+       class_eval(<<-EOS, __FILE__, __LINE__)
+         def #{attr_name}
+           load
+           @#{attr_name}
+         end
+       EOS
+     end
+
+     # Shortcut to ucd.codepoints[]
+     def [](index); codepoints[index]; end
+
+     # Returns the directory in which the data files are stored
+     def self.dirname
+       File.dirname(__FILE__) + '/../../values/'
+     end
+
+     # Returns the filename for the data file for this version
+     def self.filename
+       File.expand_path File.join(dirname, "unicode_tables.dat")
+     end
+
+     # Loads the unicode database and returns all the internal objects of UnicodeDatabase
+     # Once the values have been loaded, define attr_reader methods for the instance variables.
+     def load
+       begin
+         @codepoints, @composition_exclusion, @composition_map, @boundary, @cp1252 = File.open(self.class.filename, 'rb') { |f| Marshal.load f.read }
+       rescue Exception => e
+         raise IOError.new("Couldn't load the unicode tables for UTF8Handler (#{e.message}), handler is unusable")
+       end
+       @codepoints ||= Hash.new(Codepoint.new)
+       @composition_exclusion ||= []
+       @composition_map ||= {}
+       @boundary ||= {}
+       @cp1252 ||= {}
+
+       # Redefine the === method so we can write shorter rules for grapheme cluster breaks
+       @boundary.each do |k,_|
+         @boundary[k].instance_eval do
+           def ===(other)
+             detect { |i| i === other } ? true : false
+           end
+         end if @boundary[k].kind_of?(Array)
+       end
+
+       # define attr_reader methods for the instance variables
+       class << self
+         attr_reader :codepoints, :composition_exclusion, :composition_map, :boundary, :cp1252
+       end
+     end
+   end
+
+   # UTF8Handler implements Unicode aware operations for strings, these operations will be used by the Chars
+   # proxy when $KCODE is set to 'UTF8'.
+   class UTF8Handler
+     # Hangul character boundaries and properties
+     HANGUL_SBASE = 0xAC00
+     HANGUL_LBASE = 0x1100
+     HANGUL_VBASE = 0x1161
+     HANGUL_TBASE = 0x11A7
+     HANGUL_LCOUNT = 19
+     HANGUL_VCOUNT = 21
+     HANGUL_TCOUNT = 28
+     HANGUL_NCOUNT = HANGUL_VCOUNT * HANGUL_TCOUNT
+     HANGUL_SCOUNT = 11172
+     HANGUL_SLAST = HANGUL_SBASE + HANGUL_SCOUNT
+     HANGUL_JAMO_FIRST = 0x1100
+     HANGUL_JAMO_LAST = 0x11FF
+
+     # All the unicode whitespace
+     UNICODE_WHITESPACE = [
+       (0x0009..0x000D).to_a, # White_Space # Cc [5]  <control-0009>..<control-000D>
+       0x0020,                # White_Space # Zs      SPACE
+       0x0085,                # White_Space # Cc      <control-0085>
+       0x00A0,                # White_Space # Zs      NO-BREAK SPACE
+       0x1680,                # White_Space # Zs      OGHAM SPACE MARK
+       0x180E,                # White_Space # Zs      MONGOLIAN VOWEL SEPARATOR
+       (0x2000..0x200A).to_a, # White_Space # Zs [11] EN QUAD..HAIR SPACE
+       0x2028,                # White_Space # Zl      LINE SEPARATOR
+       0x2029,                # White_Space # Zp      PARAGRAPH SEPARATOR
+       0x202F,                # White_Space # Zs      NARROW NO-BREAK SPACE
+       0x205F,                # White_Space # Zs      MEDIUM MATHEMATICAL SPACE
+       0x3000,                # White_Space # Zs      IDEOGRAPHIC SPACE
+     ].flatten.freeze
+
+     # BOM (byte order mark) can also be seen as whitespace, it's a non-rendering character used to distinguish
+     # between little and big endian. This is not an issue in utf-8, so it must be ignored.
+     UNICODE_LEADERS_AND_TRAILERS = UNICODE_WHITESPACE + [65279] # ZERO-WIDTH NO-BREAK SPACE aka BOM
+
+     # Borrowed from the Kconv library by Shinji KONO - (also as seen on the W3C site)
+     UTF8_PAT = /\A(?:
+       [\x00-\x7f]                                     |
+       [\xc2-\xdf] [\x80-\xbf]                         |
+       \xe0        [\xa0-\xbf] [\x80-\xbf]             |
+       [\xe1-\xef] [\x80-\xbf] [\x80-\xbf]             |
+       \xf0        [\x90-\xbf] [\x80-\xbf] [\x80-\xbf] |
+       [\xf1-\xf3] [\x80-\xbf] [\x80-\xbf] [\x80-\xbf] |
+       \xf4        [\x80-\x8f] [\x80-\xbf] [\x80-\xbf]
+     )*\z/xn
+
+     # Returns a regular expression pattern that matches the passed Unicode codepoints
+     def self.codepoints_to_pattern(array_of_codepoints) #:nodoc:
+       array_of_codepoints.collect{ |e| [e].pack 'U*' }.join('|')
+     end
+     UNICODE_TRAILERS_PAT = /(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+\Z/
+     UNICODE_LEADERS_PAT = /\A(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+/
+
+     class << self
+
+       # ///
+       # /// BEGIN String method overrides
+       # ///
+
+       # Inserts the passed string at specified codepoint offsets
+       def insert(str, offset, fragment)
+         str.replace(
+           u_unpack(str).insert(
+             offset,
+             u_unpack(fragment)
+           ).flatten.pack('U*')
+         )
+       end
+
+       # Returns the position of the passed argument in the string, counting in codepoints
+       def index(str, *args)
+         bidx = str.index(*args)
+         bidx ? (u_unpack(str.slice(0...bidx)).size) : nil
+       end
+
+       # Works just like the indexed replace method on string, except instead of byte offsets you specify
+       # character offsets.
+       #
+       # Example:
+       #
+       #   s = "Müller"
+       #   s.chars[2] = "e" # Replace character with offset 2
+       #   s # => "Müeler"
+       #
+       #   s = "Müller"
+       #   s.chars[1, 2] = "ö" # Replace 2 characters at character offset 1
+       #   s # => "Möler"
+       def []=(str, *args)
+         replace_by = args.pop
+         # Indexed replace with regular expressions already works
+         return str[*args] = replace_by if args.first.is_a?(Regexp)
+         result = u_unpack(str)
+         if args[0].is_a?(Fixnum)
+           raise IndexError, "index #{args[0]} out of string" if args[0] >= result.length
+           min = args[0]
+           max = args[1].nil? ? min : (min + args[1] - 1)
+           range = Range.new(min, max)
+           replace_by = [replace_by].pack('U') if replace_by.is_a?(Fixnum)
+         elsif args.first.is_a?(Range)
+           raise RangeError, "#{args[0]} out of range" if args[0].min >= result.length
+           range = args[0]
+         else
+           needle = args[0].to_s
+           min = index(str, needle)
+           max = min + length(needle) - 1
+           range = Range.new(min, max)
+         end
+         result[range] = u_unpack(replace_by)
+         str.replace(result.pack('U*'))
+       end
+
+       # Works just like String#rjust, only integer specifies characters instead of bytes.
+       #
+       # Example:
+       #
+       #   "¾ cup".chars.rjust(8).to_s
+       #   # => "   ¾ cup"
+       #
+       #   "¾ cup".chars.rjust(8, " ").to_s # Use non-breaking whitespace
+       #   # => "   ¾ cup"
+       def rjust(str, integer, padstr=' ')
+         justify(str, integer, :right, padstr)
+       end
+
+       # Works just like String#ljust, only integer specifies characters instead of bytes.
+       #
+       # Example:
+       #
+       #   "¾ cup".chars.ljust(8).to_s
+       #   # => "¾ cup   "
+       #
+       #   "¾ cup".chars.ljust(8, " ").to_s # Use non-breaking whitespace
+       #   # => "¾ cup   "
+       def ljust(str, integer, padstr=' ')
+         justify(str, integer, :left, padstr)
+       end
+
+       # Works just like String#center, only integer specifies characters instead of bytes.
+       #
+       # Example:
+       #
+       #   "¾ cup".chars.center(8).to_s
+       #   # => " ¾ cup  "
+       #
+       #   "¾ cup".chars.center(8, " ").to_s # Use non-breaking whitespace
+       #   # => " ¾ cup  "
+       def center(str, integer, padstr=' ')
+         justify(str, integer, :center, padstr)
+       end
+
+       # Does Unicode-aware rstrip
+       def rstrip(str)
+         str.gsub(UNICODE_TRAILERS_PAT, '')
+       end
+
+       # Does Unicode-aware lstrip
+       def lstrip(str)
+         str.gsub(UNICODE_LEADERS_PAT, '')
+       end
+
+       # Removes leading and trailing whitespace
+       def strip(str)
+         str.gsub(UNICODE_LEADERS_PAT, '').gsub(UNICODE_TRAILERS_PAT, '')
+       end
+
+       # Returns the number of codepoints in the string
+       def size(str)
+         u_unpack(str).size
+       end
+       alias_method :length, :size
+
+       # Reverses codepoints in the string.
+       def reverse(str)
+         u_unpack(str).reverse.pack('U*')
+       end
+
+       # Implements Unicode-aware slice with codepoints. Slicing on one point returns the codepoints for that
+       # character.
+       def slice(str, *args)
+         if args.size > 2
+           raise ArgumentError, "wrong number of arguments (#{args.size} for 1)" # Do as if we were native
+         elsif (args.size == 2 && !(args.first.is_a?(Numeric) || args.first.is_a?(Regexp)))
+           raise TypeError, "cannot convert #{args.first.class} into Integer" # Do as if we were native
+         elsif (args.size == 2 && !args[1].is_a?(Numeric))
+           raise TypeError, "cannot convert #{args[1].class} into Integer" # Do as if we were native
+         elsif args[0].kind_of? Range
+           cps = u_unpack(str).slice(*args)
+           cps.nil? ? nil : cps.pack('U*')
+         elsif args[0].kind_of? Regexp
+           str.slice(*args)
+         elsif args.size == 1 && args[0].kind_of?(Numeric)
+           u_unpack(str)[args[0]]
+         else
+           u_unpack(str).slice(*args).pack('U*')
+         end
+       end
+       alias_method :[], :slice
+
+       # Convert characters in the string to uppercase
+       def upcase(str); to_case :uppercase_mapping, str; end
+
+       # Convert characters in the string to lowercase
+       def downcase(str); to_case :lowercase_mapping, str; end
+
+       # Returns a copy of +str+ with the first character converted to uppercase and the remainder to lowercase
+       def capitalize(str)
+         upcase(slice(str, 0..0)) + downcase(slice(str, 1..-1) || '')
+       end
+
+       # ///
+       # /// Extra String methods for unicode operations
+       # ///
+
+       # Returns the KC normalization of the string by default. NFKC is considered the best normalization form for
+       # passing strings to databases and validations.
+       #
+       # * <tt>str</tt> - The string to perform normalization on.
+       # * <tt>form</tt> - The form you want to normalize in. Should be one of the following:
+       #   <tt>:c</tt>, <tt>:kc</tt>, <tt>:d</tt>, or <tt>:kd</tt>. Default is
+       #   ActiveSupport::Multibyte::DEFAULT_NORMALIZATION_FORM.
+       def normalize(str, form=ActiveSupport::Multibyte::DEFAULT_NORMALIZATION_FORM)
+         # See http://www.unicode.org/reports/tr15, Table 1
+         codepoints = u_unpack(str)
+         case form
+           when :d
+             reorder_characters(decompose_codepoints(:canonical, codepoints))
+           when :c
+             compose_codepoints reorder_characters(decompose_codepoints(:canonical, codepoints))
+           when :kd
+             reorder_characters(decompose_codepoints(:compatability, codepoints))
+           when :kc
+             compose_codepoints reorder_characters(decompose_codepoints(:compatability, codepoints))
+           else
+             raise ArgumentError, "#{form} is not a valid normalization variant", caller
+         end.pack('U*')
+       end
+
+       # Perform decomposition on the characters in the string
+       def decompose(str)
+         decompose_codepoints(:canonical, u_unpack(str)).pack('U*')
+       end
+
+       # Perform composition on the characters in the string
+       def compose(str)
+         compose_codepoints u_unpack(str).pack('U*')
+       end
+
+       # ///
+       # /// BEGIN Helper methods for unicode operation
+       # ///
+
+       # Used to translate an offset from bytes to characters, for instance one received from a regular expression match
+       def translate_offset(str, byte_offset)
+         return nil if byte_offset.nil?
+         return 0 if str == ''
+         chunk = str[0..byte_offset]
+         begin
+           begin
+             chunk.unpack('U*').length - 1
+           rescue ArgumentError => e
+             chunk = str[0..(byte_offset+=1)]
+             # Stop retrying at the end of the string
+             raise e unless byte_offset < chunk.length
+             # We damaged a character, retry
+             retry
+           end
+         # Catch the ArgumentError so we can throw our own
+         rescue ArgumentError
+           raise EncodingError.new('malformed UTF-8 character')
+         end
+       end
+
+       # Checks if the string is valid UTF8.
+       def consumes?(str)
+         # Unpack is a little bit faster than regular expressions
+         begin
+           str.unpack('U*')
+           true
+         rescue ArgumentError
+           false
+         end
+       end
+
+       # Returns the number of grapheme clusters in the string. This method is very likely to be moved or renamed
+       # in future versions.
+       def g_length(str)
+         g_unpack(str).length
+       end
+
+       # Replaces all the non-utf-8 bytes by their iso-8859-1 or cp1252 equivalent resulting in a valid utf-8 string
+       def tidy_bytes(str)
+         str.split(//u).map do |c|
+           if !UTF8_PAT.match(c)
+             n = c.unpack('C')[0]
+             n < 128 ? n.chr :
+             n < 160 ? [UCD.cp1252[n] || n].pack('U') :
+             n < 192 ? "\xC2" + n.chr : "\xC3" + (n-64).chr
+           else
+             c
+           end
+         end.join
+       end
+
+       protected
+
+       # Detect whether the codepoint is in a certain character class. Primarily used by the
+       # grapheme cluster support.
+       def in_char_class?(codepoint, classes)
+         classes.detect { |c| UCD.boundary[c] === codepoint } ? true : false
+       end
+
+       # Unpack the string at codepoint boundaries
+       def u_unpack(str)
+         begin
+           str.unpack 'U*'
+         rescue ArgumentError
+           raise EncodingError.new('malformed UTF-8 character')
+         end
+       end
+
+       # Unpack the string at grapheme boundaries instead of codepoint boundaries
+       def g_unpack(str)
+         codepoints = u_unpack(str)
+         unpacked = []
+         pos = 0
+         marker = 0
+         eoc = codepoints.length
+         while(pos < eoc)
+           pos += 1
+           previous = codepoints[pos-1]
+           current = codepoints[pos]
+           if (
+             # CR X LF
+             one = ( previous == UCD.boundary[:cr] and current == UCD.boundary[:lf] ) or
+             # L X (L|V|LV|LVT)
+             two = ( UCD.boundary[:l] === previous and in_char_class?(current, [:l,:v,:lv,:lvt]) ) or
+             # (LV|V) X (V|T)
+             three = ( in_char_class?(previous, [:lv,:v]) and in_char_class?(current, [:v,:t]) ) or
+             # (LVT|T) X (T)
+             four = ( in_char_class?(previous, [:lvt,:t]) and UCD.boundary[:t] === current ) or
+             # X Extend
+             five = (UCD.boundary[:extend] === current)
+           )
+           else
+             unpacked << codepoints[marker..pos-1]
+             marker = pos
+           end
+         end
+         unpacked
+       end
+
+       # Reverse operation of g_unpack
+       def g_pack(unpacked)
+         unpacked.flatten
+       end
+       # Justifies a string in a certain way. Valid values for <tt>way</tt> are <tt>:right</tt>, <tt>:left</tt> and
+       # <tt>:center</tt>. Is primarily used as a helper method by <tt>rjust</tt>, <tt>ljust</tt> and <tt>center</tt>.
+       def justify(str, integer, way, padstr=' ')
+         raise ArgumentError, "zero width padding" if padstr.length == 0
+         padsize = integer - size(str)
+         padsize = padsize > 0 ? padsize : 0
+         case way
+         when :right
+           str.dup.insert(0, padding(padsize, padstr))
+         when :left
+           str.dup.insert(-1, padding(padsize, padstr))
+         when :center
+           lpad = padding((padsize / 2.0).floor, padstr)
+           rpad = padding((padsize / 2.0).ceil, padstr)
+           str.dup.insert(0, lpad).insert(-1, rpad)
+         end
+       end
+
+       # Generates a padding string of a certain size.
+       def padding(padsize, padstr=' ')
+         if padsize != 0
+           slice(padstr * ((padsize / size(padstr)) + 1), 0, padsize)
+         else
+           ''
+         end
+       end
+
+       # Convert characters to a different case
+       def to_case(way, str)
+         u_unpack(str).map do |codepoint|
+           cp = UCD[codepoint]
+           unless cp.nil?
+             ncp = cp.send(way)
+             ncp > 0 ? ncp : codepoint
+           else
+             codepoint
+           end
+         end.pack('U*')
+       end
+
+       # Re-order codepoints so the string becomes canonical
+       def reorder_characters(codepoints)
+         length = codepoints.length - 1
+         pos = 0
+         while pos < length do
+           cp1, cp2 = UCD[codepoints[pos]], UCD[codepoints[pos+1]]
+           if (cp1.combining_class > cp2.combining_class) && (cp2.combining_class > 0)
+             codepoints[pos..pos+1] = cp2.code, cp1.code
+             pos += (pos > 0 ? -1 : 1)
+           else
+             pos += 1
+           end
+         end
+         codepoints
+       end
+
+
+       # Decompose composed characters to the decomposed form
+       def decompose_codepoints(type, codepoints)
+         codepoints.inject([]) do |decomposed, cp|
+           # if it's a hangul syllable starter character
+           if HANGUL_SBASE <= cp and cp < HANGUL_SLAST
+             sindex = cp - HANGUL_SBASE
+             ncp = [] # new codepoints
+             ncp << HANGUL_LBASE + sindex / HANGUL_NCOUNT
+             ncp << HANGUL_VBASE + (sindex % HANGUL_NCOUNT) / HANGUL_TCOUNT
+             tindex = sindex % HANGUL_TCOUNT
+             ncp << (HANGUL_TBASE + tindex) unless tindex == 0
+             decomposed.concat ncp
+           # if the codepoint is decomposable with the current decomposition type
+           elsif (ncp = UCD[cp].decomp_mapping) and (!UCD[cp].decomp_type || type == :compatability)
+             decomposed.concat decompose_codepoints(type, ncp.dup)
+           else
+             decomposed << cp
+           end
+         end
+       end
+
+       # Compose decomposed characters to the composed form
+       def compose_codepoints(codepoints)
+         pos = 0
+         eoa = codepoints.length - 1
+         starter_pos = 0
+         starter_char = codepoints[0]
+         previous_combining_class = -1
+         while pos < eoa
+           pos += 1
+           lindex = starter_char - HANGUL_LBASE
+           # -- Hangul
+           if 0 <= lindex and lindex < HANGUL_LCOUNT
+             vindex = codepoints[starter_pos+1] - HANGUL_VBASE rescue vindex = -1
+             if 0 <= vindex and vindex < HANGUL_VCOUNT
+               tindex = codepoints[starter_pos+2] - HANGUL_TBASE rescue tindex = -1
+               if 0 <= tindex and tindex < HANGUL_TCOUNT
+                 j = starter_pos + 2
+                 eoa -= 2
+               else
+                 tindex = 0
+                 j = starter_pos + 1
+                 eoa -= 1
+               end
+               codepoints[starter_pos..j] = (lindex * HANGUL_VCOUNT + vindex) * HANGUL_TCOUNT + tindex + HANGUL_SBASE
+             end
+             starter_pos += 1
+             starter_char = codepoints[starter_pos]
+           # -- Other characters
+           else
+             current_char = codepoints[pos]
+             current = UCD[current_char]
+             if current.combining_class > previous_combining_class
+               if ref = UCD.composition_map[starter_char]
+                 composition = ref[current_char]
+               else
+                 composition = nil
+               end
+               unless composition.nil?
+                 codepoints[starter_pos] = composition
+                 starter_char = composition
+                 codepoints.delete_at pos
+                 eoa -= 1
+                 pos -= 1
+                 previous_combining_class = -1
+               else
+                 previous_combining_class = current.combining_class
+               end
+             else
+               previous_combining_class = current.combining_class
+             end
+             if current.combining_class == 0
+               starter_pos = pos
+               starter_char = codepoints[pos]
+             end
+           end
+         end
+         codepoints
+       end
+
+       # Unicode Database
+       UCD = UnicodeDatabase.new
+     end
+   end
+ end
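
Taken together, UTF8Handler is a set of class methods that the Chars proxy (data/lib/multibyte/chars.rb above) dispatches to when $KCODE is set to UTF-8. A rough sketch of calling the handler directly; the require path and the presence of a generated values/unicode_tables.dat (needed for the case-mapping and normalization tables) are assumptions, and the results are what the code above implies rather than verified output:

    $KCODE = 'u'                   # Ruby 1.8: treat strings as UTF-8 in //u regexps and the Chars proxy
    require 'multibyte'            # assumed entry point (data/lib/multibyte.rb)

    handler = Multibyte::Handlers::UTF8Handler
    handler.size("café")           # => 4 codepoints (the string is 5 bytes)
    handler.reverse("café")        # => "éfac" (reversed by codepoint, not by byte)
    handler.tidy_bytes("caf\xE9")  # latin-1/cp1252 byte 0xE9 becomes valid UTF-8 "é"
    handler.upcase("café")         # => "CAFÉ", via the marshalled uppercase_mapping table
    handler.normalize("café", :kc) # NFKC: decompose, reorder by combining class, recompose

The form is passed to normalize explicitly because its default argument points at ActiveSupport::Multibyte::DEFAULT_NORMALIZATION_FORM, which may not be defined when the gem is used outside Rails.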