transcriptic 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3 @@
1
# Top-level namespace of the transcriptic gem.
module Transcriptic
  # Version string of this release. Frozen so the shared constant cannot be
  # mutated in place by accident.
  VERSION = '0.1.0'.freeze
end
@@ -0,0 +1,557 @@
1
+ # Copyright 2011 Keith Rarick
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ # of this software and associated documentation files (the "Software"), to deal
5
+ # in the Software without restriction, including without limitation the rights
6
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ # copies of the Software, and to permit persons to whom the Software is
8
+ # furnished to do so, subject to the following conditions:
9
+ #
10
+ # The above copyright notice and this permission notice shall be included in
11
+ # all copies or substantial portions of the Software.
12
+ #
13
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ # THE SOFTWARE.
20
+
21
+ # See https://github.com/kr/okjson for updates.
22
+
23
+ require 'stringio'
24
+
25
+ # Some parts adapted from
26
+ # http://golang.org/src/pkg/json/decode.go and
27
+ # http://golang.org/src/pkg/utf8/utf8.go
28
module Transcriptic
  # A small, dependency-free JSON encoder and decoder (vendored kr/okjson).
  #
  # All methods are exposed as module functions through `extend self`:
  #
  #   Transcriptic::OkJson.decode('{"a": [1, 2]}')  # => {"a" => [1, 2]}
  #   Transcriptic::OkJson.encode("a" => [1, 2])    # => '{"a":[1,2]}'
  #
  # The implementation relies on the String encoding API (getbyte,
  # force_encoding, valid_encoding?) and therefore needs Ruby 1.9 or newer.
  module OkJson
    extend self

    # Raised for every malformed-input condition detected while decoding.
    class ParserError < ::StandardError; end

    # UTF-8 lead/continuation byte tags and payload masks.
    Utagx     = 0x80 # 1000 0000
    Utag2     = 0xc0 # 1100 0000
    Utag3     = 0xe0 # 1110 0000
    Utag4     = 0xf0 # 1111 0000
    Utag5     = 0xF8 # 1111 1000
    Umaskx    = 0x3f # 0011 1111
    Umask2    = 0x1f # 0001 1111
    Umask3    = 0x0f # 0000 1111
    Umask4    = 0x07 # 0000 0111

    # Largest code point representable in 1, 2 and 3 UTF-8 bytes.
    Uchar1max = (1<<7) - 1
    Uchar2max = (1<<11) - 1
    Uchar3max = (1<<16) - 1

    Ucharerr  = 0xFFFD # unicode "replacement char"
    Usurrself = 0x10000 # first code point that needs a surrogate pair
    Usurr1    = 0xd800  # first high surrogate
    Usurr2    = 0xdc00  # first low surrogate
    Usurr3    = 0xe000  # first code point after the surrogate range
    Umax      = 0x10ffff

    Spc   = ' '[0]
    # Single-letter JSON escapes mapped to the character they denote.
    Unesc = { 'b' => "\b", 'f' => "\f", 'n' => "\n", 'r' => "\r", 't' => "\t" }.freeze
    Hex   = '0123456789abcdef'.freeze

    # Byte value => short JSON escape used by strenc.
    Escapes = {
      0x22 => '\\"', 0x5c => '\\\\', 0x08 => '\\b', 0x0c => '\\f',
      0x0a => '\\n', 0x0d => '\\r', 0x09 => '\\t'
    }.freeze

    # Anchored token patterns. Anchoring matters: an unanchored match could
    # succeed further into the input and yield a lexeme that does not start
    # at the current position.
    NUMBER_PATTERN = /\A(-?(?:[1-9][0-9]+|[0-9]))([.][0-9]+)?([eE][+-]?[0-9]+)?/
    STRING_PATTERN = /\A"(?:[^"\\]|\\["\/\\bfnrt]|\\u[0-9a-fA-F]{4})*"/

    # Decodes a json document in string s and
    # returns the corresponding ruby value.
    # String s must be valid UTF-8 (binary and US-ASCII tagged strings are
    # interpreted as UTF-8 bytes; other encodings are transcoded).
    #
    # String values in the resulting structure
    # will be UTF-8.
    #
    # Raises ParserError if s is not a well-formed document.
    def decode(s)
      ts = lex(s)
      v, ts = textparse(ts)
      raise ParserError, 'trailing garbage' unless ts.empty?
      v
    end

    # Parses the top-level value of a document.
    # Returns the parsed value and any trailing tokens.
    # Note: unlike a strict RFC 4627 "json text", atomic top-level values
    # (strings, numbers, true/false/null) are accepted here, so this is
    # simply valparse.
    def textparse(ts)
      valparse(ts)
    end

    # Parses a "value" in the sense of RFC 4627.
    # Returns the parsed value and any trailing tokens.
    # Raises ParserError when ts is empty or starts with an unexpected token.
    def valparse(ts)
      raise ParserError, 'empty' if ts.empty?

      typ, _, val = ts[0]
      case typ
      when '{' then objparse(ts)
      when '[' then arrparse(ts)
      when :val, :str then [val, ts[1..-1]]
      else raise ParserError, "unexpected #{val.inspect}"
      end
    end

    # Parses an "object" in the sense of RFC 4627.
    # Returns the parsed Hash and any trailing tokens.
    def objparse(ts)
      ts = eat('{', ts)
      obj = {}
      return obj, ts[1..-1] if closes?(ts, '}')

      loop do
        k, v, ts = pairparse(ts)
        obj[k] = v
        return obj, ts[1..-1] if closes?(ts, '}')

        ts = eat(',', ts)
      end
    end

    # Parses a "member" in the sense of RFC 4627.
    # Returns the key, the parsed value and any trailing tokens.
    def pairparse(ts)
      typ, _, k = ts[0]
      raise ParserError, "unexpected #{k.inspect}" unless typ == :str

      ts = eat(':', ts[1..-1])
      v, ts = valparse(ts)
      [k, v, ts]
    end

    # Parses an "array" in the sense of RFC 4627.
    # Returns the parsed Array and any trailing tokens.
    def arrparse(ts)
      ts = eat('[', ts)
      arr = []
      return arr, ts[1..-1] if closes?(ts, ']')

      loop do
        v, ts = valparse(ts)
        arr << v
        return arr, ts[1..-1] if closes?(ts, ']')

        ts = eat(',', ts)
      end
    end

    # Consumes one token of type typ from the front of ts and returns the
    # remaining tokens. Raises ParserError if ts is exhausted or its first
    # token has a different type.
    def eat(typ, ts)
      head = ts[0]
      if head.nil? || head[0] != typ
        raise ParserError, "expected #{typ} (got #{head.inspect})"
      end
      ts[1..-1]
    end

    # True when the next token in ts has type typ. An exhausted token list
    # answers false so the caller's following eat/parse reports the error.
    def closes?(ts, typ)
      head = ts[0]
      !head.nil? && head[0] == typ
    end

    # Scans s and returns a list of json tokens,
    # excluding white space (as defined in RFC 4627).
    # Raises ParserError on input that is not valid UTF-8 or that contains a
    # character no token can start with.
    def lex(s)
      rest = utf8text(s)
      ts = []
      until rest.empty?
        typ, lexeme, val = tok(rest)
        raise ParserError, "invalid character at #{rest[0,10].inspect}" unless typ

        ts << [typ, lexeme, val] unless typ == :space
        rest = rest[lexeme.length..-1]
      end
      ts
    end

    # Returns s as a valid UTF-8 string without modifying the argument.
    # Binary and US-ASCII tagged strings are reinterpreted as UTF-8 bytes,
    # any other encoding is transcoded. Raises ParserError when that is not
    # possible or the bytes are not valid UTF-8.
    def utf8text(s)
      text =
        case s.encoding
        when Encoding::UTF_8 then s
        when Encoding::BINARY, Encoding::US_ASCII
          s.dup.force_encoding(Encoding::UTF_8)
        else
          s.encode(Encoding::UTF_8)
        end
      raise ParserError, 'input is not valid UTF-8' unless text.valid_encoding?

      text
    rescue EncodingError => e
      raise ParserError, "cannot convert input to UTF-8: #{e.message}"
    end

    # Scans the first token in s and
    # returns a 3-element list, or nil
    # if no such token exists.
    #
    # The first list element is one of
    # '{', '}', ':', ',', '[', ']',
    # :val, :str, and :space.
    #
    # The second element is the lexeme.
    #
    # The third element is the value of the
    # token for :val and :str, otherwise
    # it is the lexeme.
    def tok(s)
      c = s[0, 1]
      case c
      when '{', '}', ':', ',', '[', ']' then [c, c, c]
      when 'n' then nulltok(s)
      when 't' then truetok(s)
      when 'f' then falsetok(s)
      when '"' then strtok(s)
      when Spc, "\t", "\n", "\r" then [:space, c, c]
      else numtok(s)
      end
    end

    # Literal tokens. Each returns nil (not false) on a mismatch so that lex
    # reports the input as an invalid character.
    def nulltok(s);  [:val, 'null',  nil]   if s[0,4] == 'null'  end
    def truetok(s);  [:val, 'true',  true]  if s[0,4] == 'true'  end
    def falsetok(s); [:val, 'false', false] if s[0,5] == 'false' end

    # Scans a number at the start of s. Returns a :val token, or nil when s
    # does not start with a number.
    #
    # Numbers with a fraction or a negative exponent become Floats; all
    # others (including a non-negative exponent, e.g. "-1e2") become Integers.
    #
    # NOTE(review): an enormous exponent such as 1e999999999 still asks for a
    # correspondingly large Integer; cap it here if input is untrusted.
    def numtok(s)
      m = NUMBER_PATTERN.match(s)
      return nil unless m

      int, frac, exp = m[1], m[2], m[3]
      value =
        if frac
          Float(m[0])
        elsif exp.nil?
          int.to_i
        else
          # to_i parses base 10, so a zero-padded exponent ("e08") is fine.
          power = exp[1..-1].to_i
          power < 0 ? Float(m[0]) : int.to_i * (10**power)
        end
      [:val, m[0], value]
    end

    # Scans a string literal at the start of s and returns a :str token whose
    # value is the unquoted string. Raises ParserError if s does not start
    # with a well-formed literal.
    def strtok(s)
      m = STRING_PATTERN.match(s)
      raise ParserError, "invalid string literal at #{abbrev(s)}" unless m

      [:str, m[0], unquote(m[0])]
    end

    # Returns at most the first 10 characters of s, cut at the first
    # backtick, wrapped in backticks; '...' marks truncation.
    def abbrev(s)
      t = s[0, 10]
      cut = t.index('`')
      t = t[0, cut] if cut
      t += '...' if t.length < s.length
      "`#{t}`"
    end

    # Converts a quoted json string literal q into a UTF-8-encoded string.
    # The rules are different than for Ruby, so we cannot use eval.
    # Raises ParserError if q contains control characters, an unescaped
    # quote, or a malformed escape.
    #
    # An unpaired or mismatched \u surrogate decodes to U+FFFD.
    def unquote(q)
      body = q[1...-1]
      chars = body.chars.to_a
      out = String.new.force_encoding(Encoding::UTF_8)
      r = 0
      while r < chars.length
        c = chars[r]
        if c == '\\'
          r += 1
          if r >= chars.length
            raise ParserError, "string literal ends with a \"\\\": \"#{body}\""
          end

          esc = chars[r]
          case esc
          when '"', '\\', '/', "'"
            out << esc
            r += 1
          when 'b', 'f', 'n', 'r', 't'
            out << Unesc[esc]
            r += 1
          when 'u'
            r += 1
            hex = chars[r, 4].join
            uchar = begin
              hexdec4(hex)
            rescue ParserError => e
              raise ParserError, "invalid escape sequence \\u#{hex}: #{e.message}"
            end
            r += 4
            if surrogate?(uchar)
              pair = Ucharerr
              # Only a following \uXXXX escape can complete the pair.
              if chars[r] == '\\' && chars[r + 1] == 'u' && chars.length >= r + 6
                pair = subst(uchar, hexdec4(chars[r + 2, 4].join))
                # A valid pair; consume.
                r += 6 unless pair == Ucharerr
              end
              uchar = pair
            end
            out << [uchar].pack('U')
          else
            raise ParserError, "invalid escape char #{esc} in \"#{body}\""
          end
        elsif c == '"' || c < Spc
          raise ParserError, "invalid character in string literal \"#{body}\""
        else
          # Copy anything else unchanged.
          out << c
          r += 1
        end
      end
      out
    end

    # Decodes exactly four hex digits into an Integer.
    # Raises ParserError if s is not four characters long or contains a
    # non-hex character.
    def hexdec4(s)
      raise ParserError, 'short' if s.length != 4

      s.each_char.inject(0) { |acc, digit| (acc << 4) | nibble(digit) }
    end

    # Combines a high surrogate u1 and a low surrogate u2 into the code
    # point they represent; returns Ucharerr if they do not form a pair.
    def subst(u1, u2)
      if Usurr1 <= u1 && u1 < Usurr2 && Usurr2 <= u2 && u2 < Usurr3
        # Parenthesised: in Ruby `+` binds tighter than `|`.
        return (((u1 - Usurr1) << 10) | (u2 - Usurr2)) + Usurrself
      end
      Ucharerr
    end

    # Splits code point u into its surrogate pair [high, low]; returns
    # [Ucharerr, Ucharerr] if u cannot be represented as a pair.
    def unsubst(u)
      if u < Usurrself || u > Umax || surrogate?(u)
        return Ucharerr, Ucharerr
      end
      u -= Usurrself
      [Usurr1 + ((u>>10)&0x3ff), Usurr2 + (u&0x3ff)]
    end

    # True when u lies in the surrogate range U+D800...U+E000.
    def surrogate?(u)
      Usurr1 <= u && u < Usurr3
    end

    # Returns the value (0..15) of the single hex digit c.
    # Raises ParserError for anything else.
    def nibble(c)
      digit = c.to_s
      raise ParserError, "invalid hex code #{c}" unless digit =~ /\A[0-9a-fA-F]\z/

      digit.hex
    end

    # Encodes x into a json text. It may contain only
    # Array, Hash, String, Symbol, Numeric, true, false, nil.
    # Symbols are encoded as strings.
    # Strings contained in x must be valid UTF-8.
    # Values that cannot be represented, such as
    # NaN, Infinity, and Proc, are encoded
    # as null, in accordance with ECMA-262, 5th ed.
    def encode(x)
      case x
      when Hash    then objenc(x)
      when Array   then arrenc(x)
      when String  then strenc(x)
      when Numeric then numenc(x)
      when Symbol  then strenc(x.to_s)
      when true    then "true"
      when false   then "false"
      else "null"
      end
    end

    # Encodes Hash x as a json object. Keys are always emitted as json
    # strings (via to_s), since json allows no other key type.
    def objenc(x)
      '{' + x.map { |k, v| strenc(k.to_s) + ':' + encode(v) }.join(',') + '}'
    end

    # Encodes Array a as a json array.
    def arrenc(a)
      '[' + a.map { |x| encode(x) }.join(',') + ']'
    end

    # Encodes string s as a json string literal containing only printable
    # ASCII. s is read as UTF-8 bytes; control characters and everything
    # outside ASCII are written as \uXXXX escapes (surrogate pairs beyond
    # U+FFFF). Bytes that are not valid UTF-8 are written as \ufffd.
    def strenc(s)
      t = StringIO.new
      t.putc('"')
      r = 0
      while r < s.bytesize
        byte = s.getbyte(r)
        if (short = Escapes[byte])
          t.print(short)
          r += 1
        elsif byte >= 0x20 && byte <= 0x7e
          t.putc(byte)
          r += 1
        else
          u, size = uchardec(s, r)
          r += size
          if u < Usurrself
            t.print('\\u')
            hexenc4(t, u)
          else
            u1, u2 = unsubst(u)
            t.print('\\u')
            hexenc4(t, u1)
            t.print('\\u')
            hexenc4(t, u2)
          end
        end
      end
      t.putc('"')
      t.string
    end

    # Writes the low 16 bits of u to t (anything responding to putc) as four
    # lowercase hex digits.
    def hexenc4(t, u)
      t.putc(Hex[(u>>12)&0xf])
      t.putc(Hex[(u>>8)&0xf])
      t.putc(Hex[(u>>4)&0xf])
      t.putc(Hex[u&0xf])
    end

    # Encodes number x; NaN and infinities have no json form and become null.
    def numenc(x)
      return 'null' if x.respond_to?(:nan?) && x.nan?
      return 'null' if x.respond_to?(:infinite?) && x.infinite?

      x.to_s
    end

    # Decodes unicode character u from UTF-8
    # bytes in string s at byte position i.
    # Returns u and the number of bytes read.
    # Malformed, overlong, surrogate or out-of-range sequences yield
    # [Ucharerr, 1].
    def uchardec(s, i)
      n = s.bytesize - i
      return [Ucharerr, 1] if n < 1

      c0 = s.getbyte(i)

      # 1-byte, 7-bit sequence?
      return [c0, 1] if c0 < Utagx

      # unexpected continuation byte?
      return [Ucharerr, 1] if c0 < Utag2

      # need continuation byte
      return [Ucharerr, 1] if n < 2
      c1 = s.getbyte(i + 1)
      return [Ucharerr, 1] if c1 < Utagx || Utag2 <= c1

      # 2-byte, 11-bit sequence?
      if c0 < Utag3
        u = ((c0 & Umask2) << 6) | (c1 & Umaskx)
        return [Ucharerr, 1] if u <= Uchar1max
        return [u, 2]
      end

      # need second continuation byte
      return [Ucharerr, 1] if n < 3
      c2 = s.getbyte(i + 2)
      return [Ucharerr, 1] if c2 < Utagx || Utag2 <= c2

      # 3-byte, 16-bit sequence?
      if c0 < Utag4
        u = ((c0 & Umask3) << 12) | ((c1 & Umaskx) << 6) | (c2 & Umaskx)
        return [Ucharerr, 1] if u <= Uchar2max || surrogate?(u)
        return [u, 3]
      end

      # need third continuation byte
      return [Ucharerr, 1] if n < 4
      c3 = s.getbyte(i + 3)
      return [Ucharerr, 1] if c3 < Utagx || Utag2 <= c3

      # 4-byte, 21-bit sequence?
      if c0 < Utag5
        u = ((c0 & Umask4) << 18) | ((c1 & Umaskx) << 12) |
            ((c2 & Umaskx) << 6) | (c3 & Umaskx)
        return [Ucharerr, 1] if u <= Uchar3max || u > Umax
        return [u, 4]
      end

      [Ucharerr, 1]
    end

    # Encodes unicode character u as UTF-8
    # bytes in string a at byte position i, overwriting existing bytes and
    # extending a when the sequence runs past its end.
    # Returns the number of bytes written.
    def ucharenc(a, i, u)
      bytes =
        if u <= Uchar1max
          [u & 0xff]
        elsif u <= Uchar2max
          [Utag2 | ((u >> 6) & 0xff), Utagx | (u & Umaskx)]
        elsif u <= Uchar3max
          [Utag3 | ((u >> 12) & 0xff), Utagx | ((u >> 6) & Umaskx),
           Utagx | (u & Umaskx)]
        else
          [Utag4 | ((u >> 18) & 0xff), Utagx | ((u >> 12) & Umaskx),
           Utagx | ((u >> 6) & Umaskx), Utagx | (u & Umaskx)]
        end

      # Splice raw bytes: view the buffer as binary for the assignment, then
      # restore the caller's encoding tag.
      original_encoding = a.encoding
      a.force_encoding(Encoding::BINARY)
      begin
        a[i, bytes.length] = bytes.pack('C*')
      ensure
        a.force_encoding(original_encoding)
      end
      bytes.length
    end
  end
end