taps-taps 0.3.24

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,182 @@
1
+ require 'zlib'
2
+ require 'stringio'
3
+ require 'time'
4
+ require 'tempfile'
5
+ require 'rest_client'
6
+
7
+ require 'taps/errors'
8
+ require 'taps/chunksize'
9
+
10
+ module Taps
11
+ module Utils
12
+ extend self
13
+
14
# Reports whether the host platform is Windows.
#
# The answer is derived once from RbConfig's 'host_os' entry and memoized in
# @windows; the defined? guard keeps a cached `false` from being recomputed.
#
# Returns true when host_os matches mswin or mingw, false otherwise.
def windows?
  return @windows if defined?(@windows)
  require 'rbconfig'
  # RbConfig is the supported name; the top-level ::Config alias was
  # deprecated in Ruby 1.9 and removed in Ruby 2.2.
  @windows = !!(RbConfig::CONFIG['host_os'] =~ /mswin|mingw/)
end
19
+
20
# Returns the executable name for +cmd+: "<cmd>.cmd" when windows? is true,
# otherwise +cmd+ itself.
def bin(cmd)
  windows? ? "#{cmd}.cmd" : cmd
end
24
+
25
# Returns the CRC32 checksum of +data+ as computed by Zlib.
def checksum(data)
  crc = Zlib.crc32(data)
  crc
end
28
+
29
# Returns true when the CRC32 of +data+ equals +crc32+ (converted with to_i).
def valid_data?(data, crc32)
  actual = Zlib.crc32(data)
  expected = crc32.to_i
  actual == expected
end
32
+
33
# Base64-encodes +data+ using Array#pack's "m" directive.
def base64encode(data)
  wrapped = [data]
  wrapped.pack("m")
end
36
+
37
# Decodes Base64 text in +data+ using String#unpack's "m" directive and
# returns the decoded string.
def base64decode(data)
  decoded = data.unpack("m")
  decoded[0]
end
40
+
41
# Converts an array of row hashes into { :header => [...], :data => [[...]] }.
#
# data - array of row hashes; the first row's keys become the header.
# opts - :string_columns (passed to blobs_to_string), :schema (pairs of
#        [column, metadata]) and :table (only used in the error message).
#
# Returns {} for empty input. Raises Taps::InvalidData when a value's string
# form is longer than the varchar(N) limit declared for its column in :schema.
def format_data(data, opts={})
  return {} if data.size == 0

  string_columns = opts[:string_columns] || []
  schema = opts[:schema] || []
  table = opts[:table]

  # column => N for every column declared as varchar(N)
  max_lengths = {}
  schema.each do |column, meta|
    max_lengths[column] = $1.to_i if meta[:db_type] =~ /^varchar\((\d+)\)/
  end

  header = data[0].keys
  only_data = data.map do |row|
    row = blobs_to_string(row, string_columns)
    row.each do |column, value|
      limit = max_lengths[column]
      # columns without a declared limit are never rejected
      next if limit.nil? || value.to_s.length <= limit

      raise Taps::InvalidData.new(<<-ERROR)
Detected data that exceeds the length limitation of its column. This is
generally due to the fact that SQLite does not enforce length restrictions.

Table : #{table}
Column : #{column}
Type : #{schema.detect{|s| s.first == column}.last[:db_type]}
Data : #{value}
      ERROR
    end
    header.map { |h| row[h] }
  end

  { :header => header, :data => only_data }
end
74
+
75
# MySQL handles text and blob fields the same way internally. Other databases
# do not, so text columns have to be identified and converted back to strings.
#
# Returns the names of the columns of +table+ whose :db_type matches /text/
# when db.url contains "mysql://"; returns [] for any other url.
def incorrect_blobs(db, table)
  return [] unless db.url =~ /mysql:\/\//

  text_columns = db.schema(table).select { |_column, cdata| cdata[:db_type] =~ /text/ }
  text_columns.map { |column, _cdata| column }
end
88
+
89
# Replaces, in place, every Sequel::SQL::Blob stored in +row+ under one of
# +columns+ with its to_s value. Returns +row+.
def blobs_to_string(row, columns)
  unless columns.size == 0
    columns.each do |name|
      next unless row[name].kind_of?(Sequel::SQL::Blob)
      row[name] = row[name].to_s
    end
  end
  row
end
96
+
97
# Runs the given block with a Taps::Chunksize built from +old_chunksize+ and
# returns the result of calc_new_chunksize.
#
# The block's return value is stored as time_in_db. When the block raises
# Errno::EPIPE, RestClient::RequestFailed or RestClient::RequestTimeout the
# chunksize is reset and the block is retried; the third failure is re-raised.
def calculate_chunksize(old_chunksize)
  chunk = Taps::Chunksize.new(old_chunksize)

  begin
    chunk.start_time = Time.now
    chunk.time_in_db = yield(chunk)
  rescue Errno::EPIPE, RestClient::RequestFailed, RestClient::RequestTimeout
    chunk.retries += 1
    raise unless chunk.retries <= 2

    # we got disconnected, so the chunksize may have been too large;
    # reset it based on the number of retries and try again
    chunk.reset_chunksize
    retry
  end

  chunk.end_time = Time.now
  chunk.calc_new_chunksize
end
116
+
117
# Writes +schema_data+ to a temporary file and runs schema_bin with :load,
# +database_url+ and the file's path. Returns schema_bin's result.
def load_schema(database_url, schema_data)
  Tempfile.open('taps') do |scratch|
    File.open(scratch.path, 'w') do |out|
      out.write(schema_data)
    end
    schema_bin(:load, database_url, scratch.path)
  end
end
123
+
124
# Writes +index_data+ to a temporary file and runs schema_bin with
# :load_indexes, +database_url+ and the file's path. Returns schema_bin's
# result.
def load_indexes(database_url, index_data)
  Tempfile.open('taps') do |scratch|
    File.open(scratch.path, 'w') do |out|
      out.write(index_data)
    end
    schema_bin(:load_indexes, database_url, scratch.path)
  end
end
130
+
131
# Runs the bundled `schema` executable (resolved relative to this file via
# bin('schema')) with +args+ and returns whatever it wrote to stdout.
#
# Each argument is wrapped in single quotes. Embedded single quotes are
# escaped POSIX-style ('\'') so that a value such as a database URL whose
# password contains a quote cannot terminate the quoting early. Arguments
# without single quotes produce exactly the same command line as before.
def schema_bin(*args)
  bin_path = File.expand_path("#{File.dirname(__FILE__)}/../../bin/#{bin('schema')}")
  quoted = args.map do |a|
    # block form of gsub: the replacement is used literally
    "'" + a.to_s.gsub("'") { "'\\''" } + "'"
  end
  `"#{bin_path}" #{quoted.join(' ')}`
end
135
+
136
# Returns the names of the columns of +table+ whose schema metadata has a
# truthy :primary_key entry (an empty array when there are none).
def primary_key(db, table)
  key_columns = db.schema(table).select { |_name, meta| meta[:primary_key] }
  key_columns.map { |name, _meta| name }
end
139
+
140
# Returns true when exactly one column of +table+ is flagged :primary_key and
# has :type == :integer in the schema; false otherwise.
#
# +table+ is converted to a Sequel identifier unless it already is one.
def single_integer_primary_key(db, table)
  table = table.to_sym.identifier unless table.kind_of?(Sequel::SQL::Identifier)
  integer_keys = db.schema(table).select do |_name, meta|
    meta[:primary_key] && meta[:type] == :integer
  end
  integer_keys.size == 1
end
145
+
146
# Returns the columns to order +table+ by: its primary key column(s), or all
# of the table's columns when it has no primary key.
#
# primary_key returns an Array, and an empty Array is truthy in Ruby, so the
# previous `if pkey` test could never reach the all-columns fallback. The
# fallback is now taken whenever no key column was found.
def order_by(db, table)
  pkey = primary_key(db, table)
  keys =
    if pkey.kind_of?(Array)
      pkey
    elsif pkey
      [pkey.to_sym]
    else
      []
    end
  return keys unless keys.empty?

  table = table.to_sym.identifier unless table.kind_of?(Sequel::SQL::Identifier)
  db[table].columns
end
155
+
156
+
157
# Tries to detect server side errors in order to give the client a more
# useful error message.
#
# Calls the block and returns its value. A Sequel::DatabaseError whose message
# matches /duplicate key value/i is re-raised as
# Taps::DuplicatePrimaryKeyError with the same message; any other
# Sequel::DatabaseError is re-raised unchanged.
def server_error_handling(&blk)
  blk.call
rescue Sequel::DatabaseError => e
  raise unless e.message =~ /duplicate key value/i
  raise Taps::DuplicatePrimaryKeyError, e.message
end
170
+
171
# Re-raises +e+, translating it first when it carries a serialized server
# exception.
#
# When +e+ is a RestClient::Exception whose response has the content type
# 'application/json', the body is decoded with OkJson and the class named by
# 'error_class' is raised with 'error_message' and the 'error_backtrace'
# (passed as :backtrace). In every other case +e+ itself is raised.
#
# SECURITY: the class name comes from the remote response. It used to be
# passed to eval, which would execute arbitrary code sent by the server. It
# is now resolved by constant lookup only, and must name an Exception
# subclass; anything else falls back to raising +e+.
def reraise_server_exception(e)
  if e.kind_of?(RestClient::Exception)
    if e.respond_to?(:response) && e.response.headers[:content_type] == 'application/json'
      json = OkJson.decode(e.response.to_s)
      klass = begin
        names = json['error_class'].to_s.split('::').reject { |n| n.empty? }
        names.inject(Object) { |scope, name| scope.const_get(name) }
      rescue StandardError
        # unknown or malformed class name (or unexpected JSON shape)
        nil
      end
      klass = nil unless klass.kind_of?(Class) && klass <= ::Exception
      raise klass.new(json['error_message'], :backtrace => json['error_backtrace']) if klass
    end
  end
  raise e
end
181
+ end
182
+ end
@@ -0,0 +1,18 @@
1
+ require "yaml"
2
+
3
module Taps
  # Returns the parsed contents of VERSION.yml (located two directories above
  # this file). The file is read once and cached in @@version_yml.
  def self.version_yml
    @@version_yml ||= begin
      path = File.dirname(__FILE__) + '/../../VERSION.yml'
      YAML.load(File.read(path))
    end
  end

  # Returns "major.minor.patch", with ".build" appended when VERSION.yml
  # defines a :build entry.
  def self.version
    base = [:major, :minor, :patch].map { |part| "#{version_yml[part]}" }.join('.')
    build = version_yml[:build]
    build ? "#{base}.#{build}" : base
  end

  # Returns "major.minor".
  def self.compatible_version
    major, minor = version_yml[:major], version_yml[:minor]
    "#{major}.#{minor}"
  end
end
18
+
@@ -0,0 +1,555 @@
1
+ # Copyright 2011 Keith Rarick
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ # of this software and associated documentation files (the "Software"), to deal
5
+ # in the Software without restriction, including without limitation the rights
6
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ # copies of the Software, and to permit persons to whom the Software is
8
+ # furnished to do so, subject to the following conditions:
9
+ #
10
+ # The above copyright notice and this permission notice shall be included in
11
+ # all copies or substantial portions of the Software.
12
+ #
13
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ # THE SOFTWARE.
20
+
21
+ # See https://github.com/kr/okjson for updates.
22
+
23
+ require 'stringio'
24
+
25
+ # Some parts adapted from
26
+ # http://golang.org/src/pkg/json/decode.go and
27
+ # http://golang.org/src/pkg/utf8/utf8.go
28
+ module OkJson
29
+ extend self
30
+
31
# Raised by the decoder when its input is not acceptable JSON.
class ParserError < ::StandardError
end
32
+
33
# Decodes the JSON document in string s and returns the corresponding Ruby
# value. s must be valid UTF-8; convert strings in other encodings first.
# String values in the result are UTF-8.
#
# Raises OkJson::ParserError ('trailing garbage') when tokens remain after
# the first complete value.
def decode(s)
  value, leftover = textparse(lex(s))
  raise OkJson::ParserError, 'trailing garbage' if leftover.length > 0
  value
end
49
+
50
+
51
# Parses a complete JSON document from the token list ts.
# Returns the parsed value and any trailing tokens.
#
# Objects and arrays are dispatched to objparse/arrparse; every other token
# is handed to valparse, so a bare atomic value is accepted as a document.
#
# Raises OkJson::ParserError ('empty') when ts holds no tokens. The previous
# guard tested `ts.length < 0`, which can never be true.
def textparse(ts)
  if ts.length < 1
    raise OkJson::ParserError, 'empty'
  end

  typ, _, _val = ts[0]
  case typ
  when '{' then objparse(ts)
  when '[' then arrparse(ts)
  else valparse(ts)
  end
end
67
+
68
+
69
# Parses a "value" in the sense of RFC 4627.
# Returns the parsed value and any trailing tokens.
#
# Raises OkJson::ParserError ('empty') when ts holds no tokens (the previous
# `ts.length < 0` guard could never fire), and "unexpected ..." for a token
# that cannot start a value.
def valparse(ts)
  if ts.length < 1
    raise OkJson::ParserError, 'empty'
  end

  typ, _, val = ts[0]
  case typ
  when '{' then objparse(ts)
  when '[' then arrparse(ts)
  when :val,:str then [val, ts[1..-1]]
  else
    raise OkJson::ParserError, "unexpected #{val.inspect}"
  end
end
85
+
86
+
87
# Parses an "object" in the sense of RFC 4627.
# Returns the parsed hash and any trailing tokens.
#
# Raises OkJson::ParserError when the token list ends before the closing '}'.
# Previously a truncated document such as '{"a":1' crashed with
# NoMethodError on nil instead.
def objparse(ts)
  ts = eat('{', ts)
  obj = {}

  if ts.nil? || ts.empty?
    raise OkJson::ParserError, 'unexpected end of input in object'
  end
  return obj, ts[1..-1] if ts[0][0] == '}'

  loop do
    k, v, ts = pairparse(ts)
    obj[k] = v

    if ts.nil? || ts.empty?
      raise OkJson::ParserError, 'unexpected end of input in object'
    end
    return obj, ts[1..-1] if ts[0][0] == '}'

    # anything other than '}' after a member must be a separator
    ts = eat(',', ts)
  end
end
115
+
116
+
117
# Parses a "member" in the sense of RFC 4627: a string key, ':' and a value.
# Returns [key, value, trailing tokens].
#
# Raises OkJson::ParserError when the first token is not a string.
def pairparse(ts)
  head, rest = ts[0], ts[1..-1]
  typ, _, k = head
  unless typ == :str
    raise OkJson::ParserError, "unexpected #{k.inspect}"
  end
  v, rest = valparse(eat(':', rest))
  [k, v, rest]
end
128
+
129
+
130
# Parses an "array" in the sense of RFC 4627.
# Returns the parsed array and any trailing tokens.
#
# Raises OkJson::ParserError when the token list ends before the closing ']'.
# Previously a truncated document such as '[1' crashed with NoMethodError on
# nil instead.
def arrparse(ts)
  ts = eat('[', ts)
  arr = []

  if ts.nil? || ts.empty?
    raise OkJson::ParserError, 'unexpected end of input in array'
  end
  return arr, ts[1..-1] if ts[0][0] == ']'

  loop do
    v, ts = valparse(ts)
    arr << v

    if ts.nil? || ts.empty?
      raise OkJson::ParserError, 'unexpected end of input in array'
    end
    return arr, ts[1..-1] if ts[0][0] == ']'

    # anything other than ']' after an element must be a separator
    ts = eat(',', ts)
  end
end
158
+
159
+
160
# Consumes one token of type typ from the front of ts and returns the
# remaining tokens.
#
# Raises OkJson::ParserError when the first token has a different type or
# when ts has no tokens left (which used to crash with NoMethodError on nil).
def eat(typ, ts)
  head = ts && ts[0]
  if head.nil?
    raise OkJson::ParserError, "expected #{typ} (got end of input)"
  end
  if head[0] != typ
    raise OkJson::ParserError, "expected #{typ} (got #{head.inspect})"
  end
  ts[1..-1]
end
166
+
167
+
168
# Scans s and returns a list of json tokens,
# excluding white space (as defined in RFC 4627).
#
# Raises OkJson::ParserError when no token can be recognised at the current
# position. tok signals that with nil, but the literal helpers it delegates
# to can answer false; both are treated as "no token" here. The previous
# `typ == nil` test let false through, and the scan then crashed on
# nil.length.
def lex(s)
  ts = []
  while s.length > 0
    typ, lexeme, val = tok(s)
    unless typ
      raise OkJson::ParserError, "invalid character at #{s[0,10].inspect}"
    end
    ts << [typ, lexeme, val] if typ != :space
    s = s[lexeme.length..-1]
  end
  ts
end
184
+
185
+
186
# Scans the first token in s and returns a 3-element list, or nil if no such
# token exists.
#
# The first element is one of '{', '}', ':', ',', '[', ']', :val, :str and
# :space. The second element is the lexeme. The third element is the value
# of the token for :val and :str, otherwise it is the lexeme.
def tok(s)
  case s[0]
  when ?{, ?},
       ?[, ?],
       ?:, ?,
    # structural characters are their own type, lexeme and value
    [s[0,1], s[0,1], s[0,1]]
  when ?n then nulltok(s)
  when ?t then truetok(s)
  when ?f then falsetok(s)
  when ?" then strtok(s)
  when Spc, ?\t, ?\n, ?\r
    [:space, s[0,1], s[0,1]]
  else
    numtok(s)
  end
end
218
+
219
+
220
# Literal scanners. Each returns a [:val, lexeme, value] token when s starts
# with the literal, and nil otherwise. They used to return false on a
# mismatch, which the lexer's nil check does not recognise as "no token".
def nulltok(s); s[0,4] == 'null' ? [:val, 'null', nil] : nil end
def truetok(s); s[0,4] == 'true' ? [:val, 'true', true] : nil end
def falsetok(s); s[0,5] == 'false' ? [:val, 'false', false] : nil end
223
+
224
+
225
# Scans a JSON number at the start of s.
# Returns [:val, lexeme, number], or nil when s does not start with a number.
#
# The value is a Float when the literal has a fraction or a negative
# exponent, and an Integer otherwise. Three defects of the previous version
# are fixed for exponent-only literals:
# * the minus sign was dropped ("-1e2" produced 100),
# * a negative exponent produced a Rational ("1e-2"),
# * a zero-padded exponent ("1e08") raised ArgumentError because
#   Integer("08") is read as invalid octal.
def numtok(s)
  m = /-?([1-9][0-9]+|[0-9])([.][0-9]+)?([eE][+-]?[0-9]+)?/.match(s)
  return nil unless m && m.begin(0) == 0

  if m[2]
    [:val, m[0], Float(m[0])]
  elsif m[3]
    exponent = m[3][1..-1].to_i # base 10; accepts a sign and leading zeros
    if exponent < 0
      [:val, m[0], Float(m[0])]
    else
      sign = m[0][0, 1] == '-' ? -1 : 1
      [:val, m[0], sign * m[1].to_i * (10**exponent)]
    end
  else
    [:val, m[0], Integer(m[0])]
  end
end
237
+
238
+
239
# Scans a JSON string literal at the start of s.
# Returns [:str, lexeme, unquoted value].
#
# The pattern is anchored with \A. Without the anchor a malformed literal at
# the start of s let the regex match a later pair of quotes, so the returned
# lexeme did not correspond to the text being consumed.
#
# Raises OkJson::ParserError when s does not start with a valid literal.
def strtok(s)
  m = /\A"([^"\\]|\\["\/\\bfnrt]|\\u[0-9a-fA-F]{4})*"/.match(s)
  if ! m
    raise OkJson::ParserError, "invalid string literal at #{abbrev(s)}"
  end
  [:str, m[0], unquote(m[0])]
end
246
+
247
+
248
# Returns a short, backtick-delimited excerpt of s for error messages: at
# most the first 10 characters, cut before any backtick, with '...' appended
# when the excerpt is shorter than s.
#
# The cut position is found with String#index. The previous `t['`']` returns
# the matched substring rather than its position, so any input containing a
# backtick raised TypeError.
def abbrev(s)
  t = s[0,10]
  p = t.index('`')
  t = t[0,p] if p
  t = t + '...' if t.length < s.length
  '`' + t + '`'
end
255
+
256
+
257
# Converts a quoted json string literal q into a UTF-8-encoded string.
# The rules are different than for Ruby, so we cannot use eval.
#
# Raises OkJson::ParserError if q contains control characters, an unescaped
# double quote, or an invalid escape.
#
# The result is built by appending to a fresh string, and \u escapes are
# encoded with Array#pack('U'). The previous implementation wrote single
# bytes into a character-indexed copy of q, which on Ruby 1.9+ produced a
# binary-tagged result or raised Encoding::CompatibilityError when q also
# held literal non-ASCII text. A surrogate escape that is not followed by a
# valid low-surrogate \u escape now decodes to U+FFFD instead of emitting an
# unpaired surrogate.
def unquote(q)
  q = q[1...-1]
  out = q[0, 0] # empty string carrying q's encoding
  r = 0
  while r < q.length
    c = q[r]
    if c == ?\\
      r += 1
      if r >= q.length
        raise OkJson::ParserError, "string literal ends with a \"\\\": \"#{q}\""
      end

      case q[r]
      when ?", ?\\, ?/, ?'
        out << q[r]
        r += 1
      when ?b, ?f, ?n, ?r, ?t
        out << Unesc[q[r]]
        r += 1
      when ?u
        r += 1
        uchar = begin
          hexdec4(q[r,4])
        rescue StandardError => e
          raise OkJson::ParserError, "invalid escape sequence \\u#{q[r,4]}: #{e}"
        end
        r += 4
        if surrogate?(uchar)
          pair = Ucharerr
          # only a following \uXXXX escape can complete the pair
          if q[r, 2] == '\\u' && q[r + 2, 4] =~ /\A[0-9a-fA-F]{4}\z/
            pair = subst(uchar, hexdec4(q[r + 2, 4]))
          end
          # A valid pair; consume.
          r += 6 if pair != Ucharerr
          uchar = pair
        end
        out << [uchar].pack('U')
      else
        raise OkJson::ParserError, "invalid escape char #{q[r]} in \"#{q}\""
      end
    elsif c == ?" || c < Spc
      raise OkJson::ParserError, "invalid character in string literal \"#{q}\""
    else
      # Copy anything else as-is.
      out << c
      r += 1
    end
  end
  out
end
317
+
318
+
319
# Decodes the four hex digits in s into an integer, most significant digit
# first. Raises OkJson::ParserError ('short') unless s has length 4.
def hexdec4(s)
  raise OkJson::ParserError, 'short' unless s.length == 4

  (0..3).inject(0) { |acc, idx| (acc << 4) | nibble(s[idx]) }
end
325
+
326
+
327
# Combines the surrogate pair u1 (high) and u2 (low) into the code point it
# represents. Returns Ucharerr when the two values are not a valid pair.
#
# The OR is parenthesised before the offset is added: in Ruby `+` binds
# tighter than `|`, so the previous expression computed
# `high_bits | (low_bits + Usurrself)` and returned wrong code points
# whenever the high bits already had bit 16 set (e.g. U+20000).
def subst(u1, u2)
  if Usurr1 <= u1 && u1 < Usurr2 && Usurr2 <= u2 && u2 < Usurr3
    return (((u1 - Usurr1) << 10) | (u2 - Usurr2)) + Usurrself
  end
  Ucharerr
end
333
+
334
+
335
# Splits code point u into its surrogate pair [high, low].
# Returns [Ucharerr, Ucharerr] when u is below Usurrself, above Umax, or is
# itself a surrogate.
def unsubst(u)
  return Ucharerr, Ucharerr if u < Usurrself || u > Umax || surrogate?(u)

  offset = u - Usurrself
  high = Usurr1 + ((offset >> 10) & 0x3ff)
  low = Usurr2 + (offset & 0x3ff)
  [high, low]
end
342
+
343
+
344
# True when u lies in the surrogate range, from Usurr1 (inclusive) to Usurr3
# (exclusive).
def surrogate?(u)
  Usurr1 <= u && Usurr3 > u
end
347
+
348
+
349
# Returns the numeric value (0..15) of the hexadecimal digit c.
# Raises OkJson::ParserError for anything that is not 0-9, a-f or A-F.
#
# The letter ranges previously ran to 'z'/'Z', so characters such as 'g'
# were accepted and produced values above 15.
def nibble(c)
  case true
  when ?0 <= c && c <= ?9 then c.ord - ?0.ord
  when ?a <= c && c <= ?f then c.ord - ?a.ord + 10
  when ?A <= c && c <= ?F then c.ord - ?A.ord + 10
  else
    raise OkJson::ParserError, "invalid hex code #{c}"
  end
end
358
+
359
+
360
# Encodes x into a json text.
#
# Hash, Array, String and Numeric values are delegated to objenc, arrenc,
# strenc and numenc. A Symbol is encoded like the string x.to_s. true and
# false become "true" and "false". nil and every other value are encoded as
# "null".
# Strings contained in x must be valid UTF-8.
def encode(x)
  case x
  when Hash    then objenc(x)
  when Array   then arrenc(x)
  when String  then strenc(x)
  when Numeric then numenc(x)
  when Symbol  then strenc(x.to_s)
  when true, false
    x ? "true" : "false"
  else
    "null"
  end
end
380
+
381
+
382
# Encodes hash x as a json object; keys and values both go through encode.
def objenc(x)
  members = x.map { |k, v| "#{encode(k)}:#{encode(v)}" }
  "{#{members.join(',')}}"
end
385
+
386
+
387
# Encodes array a as a json array; every element goes through encode.
def arrenc(a)
  items = a.map { |x| encode(x) }
  "[#{items.join(',')}]"
end
390
+
391
+
392
# Encodes string s as a double-quoted json string literal.
#
# '"' and '\' and the control characters \b \f \n \r \t get their short
# escapes, printable ASCII (space through '~') is copied, and everything else
# is decoded with uchardec and written as \uXXXX (a surrogate pair for code
# points of 0x10000 and above).
#
# uchardec works on bytes. On Ruby 1.9+ indexing a UTF-8 string yields whole
# characters, which made every non-ASCII character come out as \ufffd. The
# string is therefore scanned through a binary-tagged copy when the runtime
# supports encodings; the caller's string is not modified.
def strenc(s)
  s = s.dup.force_encoding('BINARY') if s.respond_to?(:force_encoding)

  t = StringIO.new
  t.putc(?")
  r = 0
  while r < s.length
    c = s[r]
    case c
    when ?"  then t.print('\\"')
    when ?\\ then t.print('\\\\')
    when ?\b then t.print('\\b')
    when ?\f then t.print('\\f')
    when ?\n then t.print('\\n')
    when ?\r then t.print('\\r')
    when ?\t then t.print('\\t')
    else
      if Spc <= c && c <= ?~
        t.putc(c)
      else
        u, size = uchardec(s, r)
        r += size - 1 # we add one more at the bottom of the loop
        if u < 0x10000
          t.print('\\u')
          hexenc4(t, u)
        else
          u1, u2 = unsubst(u)
          t.print('\\u')
          hexenc4(t, u1)
          t.print('\\u')
          hexenc4(t, u2)
        end
      end
    end
    r += 1
  end
  t.putc(?")
  t.string
end
432
+
433
+
434
# Writes the low 16 bits of u to t as four hex digits taken from Hex, most
# significant digit first.
def hexenc4(t, u)
  [12, 8, 4, 0].map { |shift| t.putc(Hex[(u >> shift) & 0xf]) }.last
end
440
+
441
+
442
# Encodes number x: 'null' when x reports nan? or infinite?, otherwise its
# string form. Errors raised by those checks (for instance when x does not
# implement them) are ignored and x is encoded normally.
def numenc(x)
  begin
    return 'null' if x.nan? || x.infinite?
  rescue
    # fall through and encode x as-is
  end
  "#{x}"
end
448
+
449
+
450
# Decodes unicode character u from UTF-8 bytes in string s at position i.
# Returns u and the number of bytes read.
#
# Returns [Ucharerr, 1] for a stray continuation byte, a truncated sequence,
# an invalid continuation byte, an over-long encoding, or a lead byte of
# Utag5 or above.
def uchardec(s, i)
  avail = s.length - i
  return [Ucharerr, 1] if avail < 1

  lead = s[i].ord
  return [lead, 1] if lead < Utagx      # 1-byte, 7-bit sequence
  return [Ucharerr, 1] if lead < Utag2  # unexpected continuation byte

  # true when byte lies between Utagx (inclusive) and Utag2 (exclusive)
  continuation = lambda { |byte| Utagx <= byte && byte < Utag2 }

  # 2-byte, 11-bit sequence
  return [Ucharerr, 1] if avail < 2
  b1 = s[i + 1].ord
  return [Ucharerr, 1] unless continuation.call(b1)
  if lead < Utag3
    u = ((lead & Umask2) << 6) | (b1 & Umaskx)
    return u <= Uchar1max ? [Ucharerr, 1] : [u, 2]
  end

  # 3-byte, 16-bit sequence
  return [Ucharerr, 1] if avail < 3
  b2 = s[i + 2].ord
  return [Ucharerr, 1] unless continuation.call(b2)
  if lead < Utag4
    u = ((lead & Umask3) << 12) | ((b1 & Umaskx) << 6) | (b2 & Umaskx)
    return u <= Uchar2max ? [Ucharerr, 1] : [u, 3]
  end

  # 4-byte, 21-bit sequence
  return [Ucharerr, 1] if avail < 4
  b3 = s[i + 3].ord
  return [Ucharerr, 1] unless continuation.call(b3)
  if lead < Utag5
    u = ((lead & Umask4) << 18) | ((b1 & Umaskx) << 12) | ((b2 & Umaskx) << 6) | (b3 & Umaskx)
    return u <= Uchar3max ? [Ucharerr, 1] : [u, 4]
  end

  [Ucharerr, 1]
end
505
+
506
+
507
# Encodes unicode character u as UTF-8 bytes in string a at position i.
# Returns the number of bytes written (1 to 4, chosen by comparing u with
# Uchar1max, Uchar2max and Uchar3max).
def ucharenc(a, i, u)
  bytes =
    if u <= Uchar1max
      [u & 0xff]
    elsif u <= Uchar2max
      [Utag2 | ((u >> 6) & 0xff),
       Utagx | (u & Umaskx)]
    elsif u <= Uchar3max
      [Utag3 | ((u >> 12) & 0xff),
       Utagx | ((u >> 6) & Umaskx),
       Utagx | (u & Umaskx)]
    else
      [Utag4 | ((u >> 18) & 0xff),
       Utagx | ((u >> 12) & Umaskx),
       Utagx | ((u >> 6) & Umaskx),
       Utagx | (u & Umaskx)]
    end

  bytes.each_with_index { |byte, offset| a[i + offset] = byte.chr }
  bytes.length
end
532
+
533
# UTF-8 tag bits
Utagx = 0b1000_0000
Utag2 = 0b1100_0000
Utag3 = 0b1110_0000
Utag4 = 0b1111_0000
Utag5 = 0b1111_1000

# payload masks used together with the tags above
Umaskx = 0b0011_1111
Umask2 = 0b0001_1111
Umask3 = 0b0000_1111
Umask4 = 0b0000_0111

# upper bounds used to choose between 1-, 2- and 3-byte encodings
Uchar1max = 0x7f
Uchar2max = 0x7ff
Uchar3max = 0xffff

Ucharerr = 0xFFFD # unicode "replacement char"

# surrogate pair arithmetic
Usurrself = 0x10000
Usurr1 = 0xd800
Usurr2 = 0xdc00
Usurr3 = 0xe000
Umax = 0x10ffff

Spc = ' '[0]
Unesc = { ?b => ?\b, ?f => ?\f, ?n => ?\n, ?r => ?\r, ?t => ?\t }
Hex = '0123456789abcdef'
555
+ end