RFC7159 7159
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +61 -0
- data/.rspec +4 -0
- data/.yardopts +2 -0
- data/Gemfile +46 -0
- data/LICENSE.txt +40 -0
- data/README.txt +899 -0
- data/RFC7159 +899 -0
- data/RFC7159.gemspec +85 -0
- data/Rakefile +82 -0
- data/lib/RFC7159.rb +80 -0
- data/lib/RFC7159/array.rb +134 -0
- data/lib/RFC7159/dumper.rb +231 -0
- data/lib/RFC7159/false.rb +79 -0
- data/lib/RFC7159/null.rb +79 -0
- data/lib/RFC7159/number.rb +149 -0
- data/lib/RFC7159/object.rb +140 -0
- data/lib/RFC7159/parser.ry +267 -0
- data/lib/RFC7159/string.rb +221 -0
- data/lib/RFC7159/true.rb +79 -0
- data/lib/RFC7159/value.rb +96 -0
- data/lib/RFC7159/version.rb +48 -0
- data/spec/RFC7159_spec.rb +275 -0
- data/spec/acceptance/README.txt +66 -0
- data/spec/acceptance/invalid/0001-ws/0001-verical-tab.txt +1 -0
- data/spec/acceptance/invalid/0001-ws/0002-null.txt +0 -0
- data/spec/acceptance/invalid/0001-ws/0003-space-in-number-1.txt +1 -0
- data/spec/acceptance/invalid/0001-ws/0004-space-in-number-2.txt +1 -0
- data/spec/acceptance/invalid/0001-ws/0005-space-in-number-3.txt +1 -0
- data/spec/acceptance/invalid/0001-ws/0006-space-in-number-4.txt +1 -0
- data/spec/acceptance/invalid/0001-ws/0007-space-in-number-5.txt +1 -0
- data/spec/acceptance/invalid/0001-ws/0008-space-in-number-6.txt +1 -0
- data/spec/acceptance/invalid/0001-ws/0009-space-in-literal.txt +1 -0
- data/spec/acceptance/invalid/0001-ws/0010-Unicode-LINE_SEPARATOR.txt +1 -0
- data/spec/acceptance/invalid/0002-comments/0001-C-style.txt +1 -0
- data/spec/acceptance/invalid/0002-comments/0002-C-plusplus-style.txt +2 -0
- data/spec/acceptance/invalid/0002-comments/0003-sh-style.txt +2 -0
- data/spec/acceptance/invalid/0002-comments/0004-python-docstring.txt +3 -0
- data/spec/acceptance/invalid/0002-comments/0005-SQL-style.txt +2 -0
- data/spec/acceptance/invalid/0002-comments/0006-BASIC-style.txt +2 -0
- data/spec/acceptance/invalid/0003-literals/0001-FALSE.txt +1 -0
- data/spec/acceptance/invalid/0003-literals/0002-NULL.txt +1 -0
- data/spec/acceptance/invalid/0003-literals/0003-TRUE.txt +1 -0
- data/spec/acceptance/invalid/0003-literals/0004-NUL.txt +1 -0
- data/spec/acceptance/invalid/0003-literals/0005-nil.txt +1 -0
- data/spec/acceptance/invalid/0003-literals/0006-undef.txt +1 -0
- data/spec/acceptance/invalid/0003-literals/0007-modifier.txt +1 -0
- data/spec/acceptance/invalid/0003-literals/0008-undefined.txt +1 -0
- data/spec/acceptance/invalid/0004-numbers/0001-omit-zero.txt +1 -0
- data/spec/acceptance/invalid/0004-numbers/0002-minus-dot.txt +1 -0
- data/spec/acceptance/invalid/0004-numbers/0003-missing-frac.txt +1 -0
- data/spec/acceptance/invalid/0004-numbers/0004-missing-exp.txt +1 -0
- data/spec/acceptance/invalid/0004-numbers/0005-octal.txt +1 -0
- data/spec/acceptance/invalid/0004-numbers/0006-hexadecimal.txt +1 -0
- data/spec/acceptance/invalid/0004-numbers/0007-comma.txt +1 -0
- data/spec/acceptance/invalid/0004-numbers/0008-perl-underscore.txt +1 -0
- data/spec/acceptance/invalid/0004-numbers/0009-NaN.txt +1 -0
- data/spec/acceptance/invalid/0004-numbers/0010-Inf.txt +1 -0
- data/spec/acceptance/invalid/0004-numbers/0011-Infinity.txt +1 -0
- data/spec/acceptance/invalid/0005-strings/0000-NUL.txt +0 -0
- data/spec/acceptance/invalid/0005-strings/0001-not-terminated.txt +1 -0
- data/spec/acceptance/invalid/0005-strings/0002-single-quote.txt +1 -0
- data/spec/acceptance/invalid/0005-strings/0003-back-quote.txt +1 -0
- data/spec/acceptance/invalid/0005-strings/0004-carriage-return.txt +1 -0
- data/spec/acceptance/invalid/0005-strings/0005-line-feed.txt +2 -0
- data/spec/acceptance/invalid/0005-strings/0006-unknown-escape-a.txt +1 -0
- data/spec/acceptance/invalid/0005-strings/0007-unknown-escape-perl-style.txt +1 -0
- data/spec/acceptance/invalid/0005-strings/0008-unknown-escape-C-style.txt +1 -0
- data/spec/acceptance/invalid/0005-strings/0009-unknown-escape-ruby-style.txt +1 -0
- data/spec/acceptance/invalid/0005-strings/0010-escape-too-short.txt +1 -0
- data/spec/acceptance/invalid/0005-strings/0011-C-string-concat.txt +1 -0
- data/spec/acceptance/invalid/0005-strings/0012-perl-string-concat.txt +1 -0
- data/spec/acceptance/invalid/0005-strings/0013-Java-string-concat.txt +1 -0
- data/spec/acceptance/invalid/0006-encodings/0001-CESU-8.txt +1 -0
- data/spec/acceptance/invalid/0006-encodings/0002-Windows-31J.txt +1 -0
- data/spec/acceptance/invalid/0006-encodings/0003-EBCDIC.txt +1 -0
- data/spec/acceptance/invalid/0006-encodings/0004-overlong-utf8.txt +1 -0
- data/spec/acceptance/invalid/0007-arrays/0001-lacks-open.txt +1 -0
- data/spec/acceptance/invalid/0007-arrays/0002-lacks-close.txt +1 -0
- data/spec/acceptance/invalid/0007-arrays/0003-interleaving-parens.txt +1 -0
- data/spec/acceptance/invalid/0007-arrays/0004-dangling-comma.txt +1 -0
- data/spec/acceptance/invalid/0007-arrays/0005-missing-comma.txt +1 -0
- data/spec/acceptance/invalid/0007-arrays/0006-colon-instead-of-comma.txt +1 -0
- data/spec/acceptance/invalid/0008-hashes/0001-key-missing.txt +1 -0
- data/spec/acceptance/invalid/0008-hashes/0002-value-missing.txt +1 -0
- data/spec/acceptance/invalid/0008-hashes/0003-true-key.txt +1 -0
- data/spec/acceptance/invalid/0008-hashes/0004-false-key.txt +1 -0
- data/spec/acceptance/invalid/0008-hashes/0005-null-key.txt +1 -0
- data/spec/acceptance/invalid/0008-hashes/0006-numeric-key.txt +1 -0
- data/spec/acceptance/invalid/0008-hashes/0007-array-key.txt +1 -0
- data/spec/acceptance/invalid/0008-hashes/0008-hash-key.txt +1 -0
- data/spec/acceptance/invalid/0008-hashes/0009-key-not-escaped.txt +4 -0
- data/spec/acceptance/invalid/0009-javascriptisms/0001-JSONP.txt +1 -0
- data/spec/acceptance/invalid/0009-javascriptisms/0002-new-Array.txt +3 -0
- data/spec/acceptance/invalid/0009-javascriptisms/0003-new-Date.txt +1 -0
- data/spec/acceptance/invalid/0009-javascriptisms/0004-new-Error.txt +1 -0
- data/spec/acceptance/invalid/0009-javascriptisms/0005-Math.txt +1 -0
- data/spec/acceptance/invalid/0009-javascriptisms/0006-regular-expression.txt +1 -0
- data/spec/acceptance/invalid/0009-javascriptisms/0007-function.txt +7 -0
- data/spec/acceptance/invalid/0009-javascriptisms/0008-this.txt +1 -0
- data/spec/acceptance/invalid/0009-javascriptisms/0009-plusplus.txt +3 -0
- data/spec/acceptance/invalid/0009-javascriptisms/0010-ternary-operator.txt +1 -0
- data/spec/acceptance/valid/0001-ws/0001-space.json +1 -0
- data/spec/acceptance/valid/0001-ws/0002-tab.json +1 -0
- data/spec/acceptance/valid/0001-ws/0003-lf.json +1 -0
- data/spec/acceptance/valid/0001-ws/0004-cr.json +1 -0
- data/spec/acceptance/valid/0001-ws/0005-before.json +1 -0
- data/spec/acceptance/valid/0001-ws/0006-after.json +1 -0
- data/spec/acceptance/valid/0001-ws/0007-around-comma.json +3 -0
- data/spec/acceptance/valid/0001-ws/0008-around-colon.json +3 -0
- data/spec/acceptance/valid/0002-bare-values/0001-false.json +1 -0
- data/spec/acceptance/valid/0002-bare-values/0002-null.json +1 -0
- data/spec/acceptance/valid/0002-bare-values/0003-true.json +1 -0
- data/spec/acceptance/valid/0002-bare-values/0004-number.json +1 -0
- data/spec/acceptance/valid/0002-bare-values/0005-string.json +1 -0
- data/spec/acceptance/valid/0003-literals/0001-false.json +1 -0
- data/spec/acceptance/valid/0003-literals/0002-null.json +1 -0
- data/spec/acceptance/valid/0003-literals/0003-true.json +1 -0
- data/spec/acceptance/valid/0004-numbers/0000-zero.json +1 -0
- data/spec/acceptance/valid/0004-numbers/0001-one.json +1 -0
- data/spec/acceptance/valid/0004-numbers/0002-two.json +1 -0
- data/spec/acceptance/valid/0004-numbers/0003-three.json +1 -0
- data/spec/acceptance/valid/0004-numbers/0004-four.json +1 -0
- data/spec/acceptance/valid/0004-numbers/0005-five.json +1 -0
- data/spec/acceptance/valid/0004-numbers/0006-six.json +1 -0
- data/spec/acceptance/valid/0004-numbers/0007-seven.json +1 -0
- data/spec/acceptance/valid/0004-numbers/0008-eight.json +1 -0
- data/spec/acceptance/valid/0004-numbers/0009-nine.json +1 -0
- data/spec/acceptance/valid/0004-numbers/0010-ten.json +1 -0
- data/spec/acceptance/valid/0004-numbers/0011-minus.json +1 -0
- data/spec/acceptance/valid/0004-numbers/0012-fraction.json +1 -0
- data/spec/acceptance/valid/0004-numbers/0013-exponent.json +1 -0
- data/spec/acceptance/valid/0004-numbers/0014-exponent-minus.json +1 -0
- data/spec/acceptance/valid/0004-numbers/0015-exponent-plus.json +1 -0
- data/spec/acceptance/valid/0004-numbers/0016-complex.json +1 -0
- data/spec/acceptance/valid/0004-numbers/0017-DBL_MAX.json +1 -0
- data/spec/acceptance/valid/0004-numbers/0018-DBL_MIN.json +1 -0
- data/spec/acceptance/valid/0004-numbers/0019-subnormal-number.json +1 -0
- data/spec/acceptance/valid/0004-numbers/0020-1E400.json +1 -0
- data/spec/acceptance/valid/0004-numbers/0021-pi.json +1 -0
- data/spec/acceptance/valid/0004-numbers/0022-UINT32_MAX.json +1 -0
- data/spec/acceptance/valid/0004-numbers/0023-UINT64_MAX.json +1 -0
- data/spec/acceptance/valid/0004-numbers/0024-INT64_MIN.json +1 -0
- data/spec/acceptance/valid/0004-numbers/0025-high-resolution-zero.json +1 -0
- data/spec/acceptance/valid/0005-strings/0001-empty.json +1 -0
- data/spec/acceptance/valid/0005-strings/0002-basic-latin.json +1 -0
- data/spec/acceptance/valid/0005-strings/0003-escapes.json +1 -0
- data/spec/acceptance/valid/0005-strings/0004-raw-unicode.json +1 -0
- data/spec/acceptance/valid/0005-strings/0005-escaped-unicode.json +1 -0
- data/spec/acceptance/valid/0005-strings/0006-escaped-NUL.json +1 -0
- data/spec/acceptance/valid/0005-strings/0007-escaped-invalid-unicode-still-valid-as-json.json +1 -0
- data/spec/acceptance/valid/0005-strings/0008-ruby-json-gem-cant-handle-this.json +1 -0
- data/spec/acceptance/valid/0005-strings/0009-unescaped-invalid-javascript-still-valid-as-json.json +1 -0
- data/spec/acceptance/valid/0006-m17n/0001-genesis.json +6 -0
- data/spec/acceptance/valid/0006-m17n/0002-heart-sutra.json +5 -0
- data/spec/acceptance/valid/0006-m17n/0003-escaped-valid-surrogate-pair.json +1 -0
- data/spec/acceptance/valid/0006-m17n/0004-unescaped-valid-supplementary-multilingual-plane.json +1 -0
- data/spec/acceptance/valid/0007-arrays/0000-empty.json +1 -0
- data/spec/acceptance/valid/0007-arrays/0001-one-element.json +1 -0
- data/spec/acceptance/valid/0007-arrays/0002-multiple-elements.json +33 -0
- data/spec/acceptance/valid/0007-arrays/0003-various-types.json +1 -0
- data/spec/acceptance/valid/0007-arrays/0004-nested.json +17 -0
- data/spec/acceptance/valid/0008-hashes/0000-empty.json +1 -0
- data/spec/acceptance/valid/0008-hashes/0001-onekey.json +1 -0
- data/spec/acceptance/valid/0008-hashes/0002-many-keys.json +5 -0
- data/spec/acceptance/valid/0008-hashes/0003-empty-key.json +3 -0
- data/spec/acceptance/valid/0008-hashes/0004-true-value.json +3 -0
- data/spec/acceptance/valid/0008-hashes/0005-false-value.json +3 -0
- data/spec/acceptance/valid/0008-hashes/0006-null-value.json +3 -0
- data/spec/acceptance/valid/0008-hashes/0007-string-value.json +3 -0
- data/spec/acceptance/valid/0008-hashes/0008-numeric-value.json +3 -0
- data/spec/acceptance/valid/0008-hashes/0009-array-value.json +8 -0
- data/spec/acceptance/valid/0008-hashes/0010-hash-value.json +20 -0
- data/spec/acceptance/valid/0008-hashes/0011-duplicate-key-in-different-representations.json +4 -0
- data/spec/acceptance/valid/0008-hashes/0011-duplicate-key.json +4 -0
- data/spec/spec_helper.rb +54 -0
- metadata +520 -0
@@ -0,0 +1,267 @@
|
|
1
|
+
#! /your/favourite/path/to/racc
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
|
4
|
+
# Copyright (c) 2014 Urabe, Shyouhei. All rights reserved.
|
5
|
+
#
|
6
|
+
# Redistribution and use in source and binary forms, with or without
|
7
|
+
# modification, are permitted provided that the following conditions are met:
|
8
|
+
#
|
9
|
+
# - Redistributions of source code must retain the above copyright
|
10
|
+
# notice, this list of conditions and the following disclaimer.
|
11
|
+
#
|
12
|
+
# - Redistributions in binary form must reproduce the above copyright
|
13
|
+
# notice, this list of conditions and the following disclaimer in
|
14
|
+
# the documentation and/or other materials provided with the
|
15
|
+
# distribution.
|
16
|
+
#
|
17
|
+
# - Neither the name of Internet Society, IETF or IETF Trust, nor the
|
18
|
+
# names of specific contributors, may be used to endorse or promote
|
19
|
+
# products derived from this software without specific prior written
|
20
|
+
# permission.
|
21
|
+
#
|
22
|
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS”
|
23
|
+
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
24
|
+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
25
|
+
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
26
|
+
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
27
|
+
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
28
|
+
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
29
|
+
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
30
|
+
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
31
|
+
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
32
|
+
# POSSIBILITY OF SUCH DAMAGE.
|
33
|
+
|
34
|
+
# This is almost one-to-one translation of RFC7159 section 2 through 7, from
|
35
|
+
# Augmented BNF to Racc BNF. Should be the easiest to verify implementation
|
36
|
+
# against the spec.
|
37
|
+
#
|
38
|
+
# @note This parser has several shift/reduce conflicts. They are all around
|
39
|
+
# handling of white spaces (called "ws"), so can silently be ignored. I also
|
40
|
+
# checked the parser internal and made sure they are OK.
|
41
|
+
class RFC7159::Parser
|
42
|
+
|
43
|
+
options no_result_var
|
44
|
+
expect 28
|
45
|
+
rule
|
46
|
+
|
47
|
+
# Notes about nonterminal's names: in order to make manual verification
|
48
|
+
# easy, all the nonterminals that appear in the RFC are named as such. ABNF
|
49
|
+
# is much more concise than plain BNF, so here we added several helper
|
50
|
+
# nonterminals; they are prefixed with "__" so you can distinguish if a
|
51
|
+
# nonterminal is RFC-origin or not.
|
52
|
+
|
53
|
+
# RFC7159 section 2
|
54
|
+
|
55
|
+
JSON_text : ws value ws { val[1] }
|
56
|
+
begin_array : ws "\x5B" ws # [ left square bracket
|
57
|
+
begin_object : ws "\x7B" ws # { left curly bracket
|
58
|
+
end_array : ws "\x5D" ws # ] right square bracket
|
59
|
+
end_object : ws "\x7D" ws # } right curly bracket
|
60
|
+
name_separator : ws "\x3A" ws # : colon
|
61
|
+
value_separator : ws "\x2C" ws # , comma
|
62
|
+
ws : # <- this is the '*' in the ABNF
|
63
|
+
| ws "\x20" # Space
|
64
|
+
| ws "\x09" # Horizontal tab
|
65
|
+
| ws "\x0A" # Line feed or New line
|
66
|
+
| ws "\x0D" # Carriage return
|
67
|
+
|
68
|
+
# RFC7159 section 3
|
69
|
+
|
70
|
+
value : false | null | true | object | array | number | string
|
71
|
+
false : "\x66" "\x61" "\x6c" "\x73" "\x65" { [ :false ] } # false
|
72
|
+
null : "\x6e" "\x75" "\x6c" "\x6c" { [ :null ] } # null
|
73
|
+
true : "\x74" "\x72" "\x75" "\x65" { [ :true ] } # true
|
74
|
+
|
75
|
+
# RFC7159 section 4
|
76
|
+
|
77
|
+
object : begin_object end_object { [ :object ] }
|
78
|
+
| begin_object __members__ end_object { [ :object, *val[1] ] }
|
79
|
+
__members__ : member { val }
|
80
|
+
| __members__ value_separator member { [ *val[0], val[2] ] }
|
81
|
+
member : string name_separator value { [ val[0], val[2] ] }
|
82
|
+
|
83
|
+
# RFC7159 section 5
|
84
|
+
|
85
|
+
array : begin_array end_array { [ :array ] }
|
86
|
+
| begin_array __list__ end_array { [ :array, *val[1] ] }
|
87
|
+
__list__ : value { val }
|
88
|
+
| __list__ value_separator value { [ *val[0], val[2] ] }
|
89
|
+
|
90
|
+
# RFC7159 section 6
|
91
|
+
|
92
|
+
number : __minus_p__ int __frac_p__ __exp_p__ { [ :number, *val ] }
|
93
|
+
__minus_p__ : | minus
|
94
|
+
__frac_p__ : | frac
|
95
|
+
__exp_p__ : | exp
|
96
|
+
decimal_point : "\x2E" # .
|
97
|
+
digit1_9 : "\x31" | "\x32" | "\x33" | "\x34" | "\x35"
|
98
|
+
| "\x36" | "\x37" | "\x38" | "\x39"
|
99
|
+
e : "\x65" | "\x45" # e E
|
100
|
+
exp : e __sign__ __digit_plus__ { val }
|
101
|
+
frac : decimal_point __digit_plus__ { val }
|
102
|
+
int : zero { val }
|
103
|
+
| digit1_9 { val }
|
104
|
+
| digit1_9 __digit_plus__ { [ val[0], *val[1] ] }
|
105
|
+
minus : "\x2D" # -
|
106
|
+
plus : "\x2B" # +
|
107
|
+
zero : "\x30" # 0
|
108
|
+
DIGIT : zero | digit1_9
|
109
|
+
__sign__ : | plus | minus
|
110
|
+
__digit_plus__ : DIGIT { val }
|
111
|
+
| __digit_plus__ DIGIT { [ *val[0], val[1] ] }
|
112
|
+
|
113
|
+
# RFC7159 section 7
|
114
|
+
|
115
|
+
string : quotation_mark quotation_mark { [ :string ] }
|
116
|
+
| quotation_mark __chars__ quotation_mark { [ :string, *val[1] ] }
|
117
|
+
__chars__ : char { val }
|
118
|
+
| __chars__ char { [ *val[0], val[1] ] }
|
119
|
+
char : unescaped | escape __ctrl__ { val }
|
120
|
+
__ctrl__ : "\x22" # " quotation mark U+0022
|
121
|
+
| "\x5C" # \ reverse solidus U+005C
|
122
|
+
| "\x2F" # / solidus U+002F
|
123
|
+
| "\x62" # b backspace U+0008
|
124
|
+
| "\x66" # f form feed U+000C
|
125
|
+
| "\x6E" # n line feed U+000A
|
126
|
+
| "\x72" # r carriage return U+000D
|
127
|
+
| "\x74" # t tab U+0009
|
128
|
+
| "\x75" # uXXXX U+XXXX
|
129
|
+
HEXDIG HEXDIG HEXDIG HEXDIG { val }
|
130
|
+
escape : "\x5C" # \
|
131
|
+
quotation_mark : "\x22" # "
|
132
|
+
HEXDIG : DIGIT
|
133
|
+
| "\x61" | "\x62" | "\x63" | "\x64" | "\x65" | "\x66"
|
134
|
+
| "\x41" | "\x42" | "\x43" | "\x44" | "\x45" | "\x46"
|
135
|
+
|
136
|
+
# "unescaped" is too much to list up here; use lexer instead.
|
137
|
+
# unescaped = %x20-21 / %x23-5B / %x5D-10FFFF
|
138
|
+
end
|
139
|
+
|
140
|
+
---- inner
|
141
|
+
|
142
|
+
# @param [true, false] accept_bom Whether to accept BOMs
|
143
|
+
# @param [true, false] yydebug Whether to enable debug mode
|
144
|
+
# Stores the two construction-time switches on the instance.
#
# @param [true, false] accept_bom  Whether to accept BOMs
# @param [true, false] yydebug     Whether to enable debug mode
def initialize(accept_bom: false, yydebug: false)
  @accept_bom, @yydebug = accept_bom, yydebug
end
|
148
|
+
|
149
|
+
# Parses str and generates AST. The str must consist of _a_ valid JSON
|
150
|
+
# text, otherwise an exception shall raise.
|
151
|
+
#
|
152
|
+
# @param [#each_char] str IO or String or something to parse
|
153
|
+
# @return [::Array] Parsed AST
|
154
|
+
# @raise [Racc::ParseError] The input is invalid
|
155
|
+
# @raise [Encoding::CompatibilityError] The input is invalid
|
156
|
+
# Parses str and generates AST.  The str must consist of _a_ valid JSON
# text, otherwise an exception shall raise.
#
# @param  [#each_char] str             IO or String or something to parse
# @return [::Array]                    Parsed AST
# @raise  [Racc::ParseError]           The input is invalid (this includes
#                                      input with no characters at all)
# @raise  [Encoding::CompatibilityError] The input is not in an accepted
#                                      encoding
def parse str
  # Reset all per-parse lexer state, so that one parser instance can be
  # reused without position bookkeeping leaking from the previous run.
  @state   = :init
  @newline = nil
  @lineno  = 1
  @column  = 1
  @enum    = str.enum_for :each_char

  begin
    firstchar = @enum.peek
  rescue StopIteration
    # Nothing to peek at: report it as a parse error, not as a stray
    # StopIteration that callers rescuing Racc::ParseError would miss.
    raise Racc::ParseError,
      'Syntax error near line 1, char 1: empty input is not a JSON text'
  end

  case enc = firstchar.encoding
  when Encoding::UTF_8,
       Encoding::US_ASCII, # true subset of UTF-8
       Encoding::UTF8_MAC, # true subset of UTF-8
       Encoding::UTF_16LE,
       Encoding::UTF_16BE,
       Encoding::UTF_32LE,
       Encoding::UTF_32BE
    # RFC7159 section 8.1 explicitly states that the input string must be
    # either UTF 8, 16, or 32 -encoded.  That point is as clear as the
    # sky.  All other encodings are NG.  However, what we call the ASCII
    # encoding is the true subset of UTF-8.  A string of ASCII must also
    # be valid as UTF-8.  So we allow this.
    #
    # There are discussions about parsing BOMs.  The original RFC4627 said
    # nothing about BOMs, however its section 3 ("Encoding") cannot be
    # read as if it expected BOMs.  Current RFC7159 _prohibits_ to
    # generate JSON texts with BOMs but _allows_ to accept.
    #
    # This parser can control whether to accept BOMs.
    #
    # The BOM is detected by code point rather than by transcoding a
    # U+FEFF literal into enc: that transcoding is impossible for
    # US-ASCII and would raise before parsing even starts.
    if @accept_bom and firstchar.valid_encoding? and firstchar.ord == 0xFEFF
      @enum.next # consume
    end
    return do_parse
  else
    raise Encoding::CompatibilityError, <<-"MSG".gsub(/[\n\s]+/, ' ')
      ``JSON text SHALL be encoded in UTF-8, UTF-16, or UTF-32'', said
      RFC7159 section 8.1. The given string is NOT in any of those
      encodings (but #{enc.inspect}).
    MSG
  end
end
|
195
|
+
|
196
|
+
private
|
197
|
+
# Lexer callback: hands the next character to the parser as a token.
#
# Outside of strings every character is its own token.  Inside of strings
# the characters allowed by the "unescaped" production are reported as
# +:unescaped+, because that set is too large to list in the grammar.
# Along the way this keeps @lineno / @column up to date for error messages.
#
# @return [::Array] a pair of token and value; [false, @enum] once the
#                   input is exhausted.
def next_token
  chr  = @enum.next
  code = chr.ord

  # The token is what gets matched against the grammar's terminals; the
  # value stays the original character.  Characters in ASCII-incompatible
  # encodings (UTF-16/32) are transcoded for the token only, otherwise
  # they could never equal a terminal.
  tok = chr.encoding.ascii_compatible? ? chr : chr.encode(Encoding::UTF_8)

  # Position bookkeeping.  A line break takes effect on the character
  # _after_ it.  CR immediately followed by LF is one line break, so the
  # LF of such a pair does not start a new line by itself.  Comparing code
  # points (instead of regexp matching) also works for UTF-16/32.
  prev, @newline = @newline, ((code == 0x0A or code == 0x0D) ? code : nil)
  if prev == 0x0A or (prev == 0x0D and code != 0x0A)
    @lineno += 1
    @column  = 1
  else
    @column += 1
  end

  case @state
  when :string then # recap: unescaped = %x20-21 / %x23-5B / %x5D-10FFFF
    case code
    when 0x20..0x21, 0x23..0x5B, 0x5D..0x10FFFF then tok = :unescaped
    when 0x22 then @state = :init    # " closes the string
    when 0x5C then @state = :escaped # \ starts an escape
    else           @state = :string  # not allowed here; grammar rejects it
    end
  when :init    then @state = (code == 0x22) ? :string : :init
  when :escaped then @state = (code == 0x75) ? :u1 : :string # u
  when :u1      then @state = :u2
  when :u2      then @state = :u3
  when :u3      then @state = :u4
  when :u4      then @state = :string
  end
  return tok, chr
rescue StopIteration
  return false, @enum
end
|
228
|
+
|
229
|
+
# Racc error callback.  Builds a human-friendly explanation out of the
# lexer state and the offending token, then raises.
#
# @param  [Integer] id    token id (unused)
# @param  [Object]  val   the offending token's value: a one-character
#                         string, or a non-string at the end of input
# @param  [::Array] stack parser value stack (unused)
# @raise  [Racc::ParseError] always
def on_error id, val, stack
  # next_token hands over a non-string value when the input is exhausted.
  at_eof = !val.is_a?(::String)

  # Classification below compares against ASCII-compatible literals and
  # regexps; a UTF-16/32 character has to be transcoded for that.
  probe = val
  if !at_eof and !val.encoding.ascii_compatible?
    probe = val.encode Encoding::UTF_8
  end

  reason =
    if at_eof
      'unexpected end of input'
    else
      case @state
      when :string
        'this character is not allowed in a string; escape it.'
      when :u1, :u2, :u3, :u4
        '\uXXXX must exactly be a four-letter hexadecimal sequence.'
      else
        case probe
        when "'"           then 'you must use " to quote strings'
        when '}', ']', ',' then 'possible extra (dangling) comma?'
        when ':'           then 'possible confusion of {} vs []?'
        when /\s/          then 'possible space inside of a number?'
        when /\d/          then 'possible lack of +/- in exponent?'
        else                    'unexpected character'
        end
      end
    end

  msg = sprintf 'Syntax error near line %d, char %d (%p): %s',
    @lineno, @column, (at_eof ? :EOF : val), reason
  raise Racc::ParseError, msg
end
|
255
|
+
|
256
|
+
---- footer
|
257
|
+
|
258
|
+
#
|
259
|
+
# Local Variables:
|
260
|
+
# mode: ruby
|
261
|
+
# coding: utf-8-unix
|
262
|
+
# indent-tabs-mode: t
|
263
|
+
# tab-width: 3
|
264
|
+
# ruby-indent-level: 3
|
265
|
+
# fill-column: 79
|
266
|
+
# default-justification: full
|
267
|
+
# End:
|
@@ -0,0 +1,221 @@
|
|
1
|
+
#! /your/favourite/path/to/ruby
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
|
4
|
+
# Copyright (c) 2014 Urabe, Shyouhei. All rights reserved.
|
5
|
+
#
|
6
|
+
# Redistribution and use in source and binary forms, with or without
|
7
|
+
# modification, are permitted provided that the following conditions are met:
|
8
|
+
#
|
9
|
+
# - Redistributions of source code must retain the above copyright
|
10
|
+
# notice, this list of conditions and the following disclaimer.
|
11
|
+
#
|
12
|
+
# - Redistributions in binary form must reproduce the above copyright
|
13
|
+
# notice, this list of conditions and the following disclaimer in
|
14
|
+
# the documentation and/or other materials provided with the
|
15
|
+
# distribution.
|
16
|
+
#
|
17
|
+
# - Neither the name of Internet Society, IETF or IETF Trust, nor the
|
18
|
+
# names of specific contributors, may be used to endorse or promote
|
19
|
+
# products derived from this software without specific prior written
|
20
|
+
# permission.
|
21
|
+
#
|
22
|
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS”
|
23
|
+
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
24
|
+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
25
|
+
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
26
|
+
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
27
|
+
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
28
|
+
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
29
|
+
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
30
|
+
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
31
|
+
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
32
|
+
# POSSIBILITY OF SUCH DAMAGE.
|
33
|
+
|
34
|
+
# The Strings, as described in RFC7159 section 7.
|
35
|
+
class RFC7159::String < RFC7159::Value
  # Code point denoted by each single-character escape, keyed by the code
  # point of the character following the reverse solidus.  Keyed by code
  # point (not by string) so that the lookup does not depend on the
  # encoding of the JSON text.
  ESCAPES = {
    0x22 => 0x0022, # "    quotation mark  U+0022
    0x5C => 0x005C, # \    reverse solidus U+005C
    0x2F => 0x002F, # /    solidus         U+002F
    0x62 => 0x0008, # b    backspace       U+0008
    0x66 => 0x000C, # f    form feed       U+000C
    0x6E => 0x000A, # n    line feed       U+000A
    0x72 => 0x000D, # r    carriage return U+000D
    0x74 => 0x0009, # t    tab             U+0009
  }.freeze
  private_constant :ESCAPES

  # Parse the AST from parser, and convert into corresponding values.
  # @param  [::Array]       ast the AST, generated by the parser
  # @return [String]        evaluated instance
  # @raise  [ArgumentError] malformed input
  def self.from_ast ast
    type, *ary = *ast
    raise ArgumentError, "not a string: #{ast.inspect}" if type != :string
    new ary
  end

  # @return [::String] converted string
  def plain_old_ruby_object
    return @str
  end

  alias to_s   plain_old_ruby_object
  alias to_str plain_old_ruby_object

  # @return [::String] the string, escaped
  def inspect
    sprintf "#<%p:%#016x %p>", self.class, self.object_id << 1, @str
  end

  # For pretty print
  # @param [PP] pp the pp
  def pretty_print pp
    hdr = sprintf '#<%p:%#016x', self.class, self.object_id << 1
    pp.group 1, hdr, '>' do
      pp.breakable
      @str.pretty_print pp
    end
  end

  # Re-assembles the characters exactly as they appeared in the JSON text
  # (escapes are _not_ re-computed) and wraps them in quotation marks.
  # @return [::String] original string
  def to_json(*)
    quote = '"'
    # A UTF-8 quotation mark cannot be concatenated with UTF-16/32 text.
    quote = quote.encode @str.encoding unless @str.encoding.ascii_compatible?
    quote + @orig.flatten.join('') + quote
  end

  # String comparisons are defined in RFC7159 section 8.3.  We follow that.
  # Anything that does not respond to #to_str is simply not equal.
  def == other
    self.to_str == other.to_str
  rescue NoMethodError
    return false
  end

  private
  private_class_method:new

  # @private
  # @param [::Array] ary AST children: either a raw character, or a pair of
  #   the reverse solidus and what follows it, e.g. ['\\', 'n'] or
  #   ['\\', ['u', 'F', 'F', 'E', 'E']].
  def initialize ary
    @orig = ary
    enc   = ary[0][0].encoding rescue Encoding::US_ASCII # empty string

    # Step 1: every AST child becomes one code point (or one UTF-16 code
    # unit in case of \uXXXX; surrogates are dealt with in step 2).
    units = ary.map do |item|
      next item.ord unless item.is_a? ::Array
      ctrl = item[1]
      if ctrl.is_a? ::Array
        # \uXXXX: go through code points so that hex digits written in
        # UTF-16/32 can be read, too.
        ctrl[1..4].map(&:ord).pack('U*').to_i 16
      else
        ESCAPES.fetch ctrl.ord do
          raise ArgumentError, "unknown escape: #{item.inspect}"
        end
      end
    end

    # Step 2: a high surrogate immediately followed by a low surrogate
    # forms one supplementary code point.  Lone surrogates are valid JSON
    # (RFC7159 section 8.2) and are kept as they are.
    scalars = []
    high    = nil
    units.each do |u|
      if high and (0xDC00..0xDFFF).cover? u
        scalars << 0x10000 + ((high - 0xD800) << 10) + (u - 0xDC00)
        high = nil
        next
      end
      scalars << high if high # previous high surrogate had no partner
      high = nil
      if (0xD800..0xDBFF).cover? u
        high = u              # wait and see what comes next
      else
        scalars << u
      end
    end
    scalars << high if high   # a high surrogate might remain

    # Step 3: RFC7159 section 8.1 states that the JSON text itself shall be
    # written in a sort of Unicode.  However the parsed JSON value's content
    # strings are not always Unicode-valid, according to its section 8.2.
    # Then what?  It says nothing.  Here, we preserve the JSON text's
    # encoding, i.e. if the JSON text is in UTF-16, the value is tagged as
    # UTF-16, whether or not the resulting byte sequence is valid.
    bytes = case enc
            when Encoding::UTF_32BE then scalars.pack 'N*'
            when Encoding::UTF_32LE then scalars.pack 'V*'
            when Encoding::UTF_16BE then utf16_units(scalars).pack 'n*'
            when Encoding::UTF_16LE then utf16_units(scalars).pack 'v*'
            else                         scalars.pack 'U*' # sort of UTF-8
            end
    @str = bytes.force_encoding(enc).freeze
  end

  # Splits code points beyond U+FFFF into surrogate pairs, because a 16-bit
  # pack directive would silently truncate them.
  # @param  [::Array<Integer>] scalars code points
  # @return [::Array<Integer>] UTF-16 code units
  def utf16_units scalars
    scalars.flat_map do |cp|
      next [cp] if cp <= 0xFFFF
      v = cp - 0x10000
      [0xD800 | (v >> 10), 0xDC00 | (v & 0x3FF)]
    end
  end
end
|
155
|
+
|
156
|
+
#
|
157
|
+
# Dialogue about evaluating JSON's string
|
158
|
+
# ----
|
159
|
+
# 2014.03.17.txt:20:50:01 >#ruby-ja@ircnet:shyouhei < JSONのRFC、文字列が"\uDEAD"とかなっててもvalidだよって書いてあるけど、
|
160
|
+
# 2014.03.17.txt:20:50:14 >#ruby-ja@ircnet:shyouhei < それはいいのだが
|
161
|
+
# 2014.03.17.txt:20:50:32 >#ruby-ja@ircnet:shyouhei < たとえばそのJSONがUTF-16で書かれているとして
|
162
|
+
# 2014.03.17.txt:20:50:59 >#ruby-ja@ircnet:shyouhei < UTF-16の"\uDEAD"的なのをRubyで作ろうと思うとなかなかむずかしいな
|
163
|
+
# 2014.03.17.txt:20:51:55 >#ruby-ja@ircnet:shyouhei < "\\uDEAD"という文字列(ただしUTF-16)を入力したら"\u{DEAD}"という文字列(ただしUTF-16)を出力する関数
|
164
|
+
# 2014.03.17.txt:20:52:08 >#ruby-ja@ircnet:shyouhei < むずい。
|
165
|
+
# 2014.03.17.txt:20:52:09 <#ruby-ja@ircnet:nurse > "\xDE\xAD".force_encoding("utf-16be")とかになっちゃいますなぁ
|
166
|
+
# 2014.03.17.txt:20:52:34 <#ruby-ja@ircnet:nurse > [0xDEAD].pack("n").force_encoding("utf-16be")のが素直かな
|
167
|
+
# 2014.03.17.txt:20:53:35 >#ruby-ja@ircnet:shyouhei < なんか実務上はそこまでがんばるより例外で死んだ方がしあわせになれそうではある
|
168
|
+
# 2014.03.17.txt:20:54:00 >#ruby-ja@ircnet:shyouhei < 誰も幸せにしなさそう
|
169
|
+
# 2014.03.17.txt:20:54:26 <#ruby-ja@ircnet:nurse > 死んじゃダメで、ゲタにするのが正解じゃないっけ
|
170
|
+
# 2014.03.17.txt:20:54:54 >#ruby-ja@ircnet:shyouhei < それがより正しそうですね
|
171
|
+
# 2014.03.17.txt:20:55:56 >#ruby-ja@ircnet:shyouhei < JSONはサロゲートペアもなんとかせねばならんので面倒そうだ
|
172
|
+
# 2014.03.17.txt:20:57:06 >#ruby-ja@ircnet:shyouhei < (\uXYZW が単体でNGぽいくても次にサロゲートペアが続くかもしれん)
|
173
|
+
# 2014.03.17.txt:20:57:37 >#ruby-ja@ircnet:shyouhei < めんどう!
|
174
|
+
# 2014.03.17.txt:20:57:42 >#ruby-ja@ircnet:shyouhei < UTF16しねばいいのに
|
175
|
+
# 2014.03.17.txt:20:59:06 <#ruby-ja@ircnet:nurse > とりあえずそのままUTF-16にしてみて、encodeでinvalid replaceすればいい気がする
|
176
|
+
# 2014.03.17.txt:21:00:33 >#ruby-ja@ircnet:shyouhei < すでにUTF16な文字列にサロゲートペアの片割れ的なバイナリをがしょがしょって後ろから足してからencodeするとよしなにする?
|
177
|
+
# 2014.03.17.txt:21:01:13 >#ruby-ja@ircnet:shyouhei < (頭の悪い発言なのは自覚しております)
|
178
|
+
# 2014.03.17.txt:21:01:29 <#ruby-ja@ircnet:nurse > invalid: :replaceつけてUTF-8にするなり、UTF-16のままscrubすれば
|
179
|
+
# 2014.03.17.txt:21:02:45 >#ruby-ja@ircnet:shyouhei < invalidなのはよいとして "\uFOO\uBAR" てきなサロゲートペアてきJSON文字列をちゃんとRuby的に(正しいUTF16文字列)に復元するシナリオ
|
180
|
+
# 2014.03.17.txt:21:03:46 <#ruby-ja@ircnet:nurse > たぶんAScii-8BITで足さないとエラーになる気がする
|
181
|
+
# 2014.03.17.txt:21:04:05 <#ruby-ja@ircnet:nurse > そこいがいは、無心につなげて、最後にencodeまたはscrubが正解ではないかと
|
182
|
+
# 2014.03.17.txt:21:04:13 >#ruby-ja@ircnet:shyouhei < あきらめて全部バイナリと思ってくっつけておいてから最後にencodeか
|
183
|
+
# 2014.03.17.txt:21:05:20 <#ruby-ja@ircnet:nurse > ASCII-8BITだと文字列のvalidチェックしない分速いし。
|
184
|
+
# 2014.03.17.txt:21:06:33 >#ruby-ja@ircnet:shyouhei < 世の中のJSONパーザがUTF16サポートしないという姿勢にはそれなりの理由があることがわかった。
|
185
|
+
# 2014.03.17.txt:21:07:17 <#ruby-ja@ircnet:nurse > そもそもHTTPで文字列流すのにASCII非互換ってのが邪悪である
|
186
|
+
# 2014.03.17.txt:21:15:04 <#ruby-ja@ircnet:nurse > 例のOpenBSDのsignifyをportableにしたらRubyでも使えるかなぁ
|
187
|
+
# 2014.03.17.txt:21:18:39 <#ruby-ja@ircnet:nurse > ていうか卜部さんはJSONパーサでも書いてるのかしら
|
188
|
+
# 2014.03.17.txt:21:18:56 <#ruby-ja@ircnet:nurse > って、聞いちゃいけない質問な気がした
|
189
|
+
# ----
|
190
|
+
# 2014.03.25.txt:16:08:14 >#ruby-ja@ircnet:shyouhei < "\u{dead}" を入力されたときに "\\uDEAD" を出力する関数を作成せよ
|
191
|
+
# 2014.03.25.txt:16:09:21 >#ruby-ja@ircnet:shyouhei < str.force_encoding('utf-8').scrub {|c| "\\u" + c.unpack('H*") } はだめぽい
|
192
|
+
# 2014.03.25.txt:16:14:13 >#ruby-ja@ircnet:shyouhei < primitive_convertでなんとかなるのかこれ
|
193
|
+
# 2014.03.25.txt:16:20:10 <#ruby-ja@ircnet:n0kada > "\u{dead}"ってinvalidなんだっけ
|
194
|
+
# 2014.03.25.txt:16:22:29 >#ruby-ja@ircnet:shyouhei < サロゲートペアのかたほう
|
195
|
+
# 2014.03.25.txt:16:22:44 >#ruby-ja@ircnet:shyouhei < それだけではinvalidすね
|
196
|
+
# 2014.03.25.txt:16:34:47 >#ruby-ja@ircnet:shyouhei < お、"\u{dead}".unpack('U*')で0xdeadが取得できる
|
197
|
+
# 2014.03.25.txt:16:34:57 >#ruby-ja@ircnet:shyouhei < ここからなんとかすればいいのか…?
|
198
|
+
# 2014.03.25.txt:16:35:00 >#ruby-ja@ircnet:shyouhei < しかしどうする
|
199
|
+
# 2014.03.25.txt:16:35:08 <#ruby-ja@ircnet:akr > "\u{dead}".unpack("U*").map {|c| 0xD800 <= c && c <= 0xDFFF ? "\\u%04X" % c : [c].pack("U") }.join
|
200
|
+
# 2014.03.25.txt:16:38:16 >#ruby-ja@ircnet:shyouhei < おお。
|
201
|
+
# 2014.03.25.txt:16:38:46 >#ruby-ja@ircnet:shyouhei < scrubでなんとかするのは筋が悪いことが分かりつつある
|
202
|
+
# 2014.03.25.txt:16:39:36 >#ruby-ja@ircnet:shyouhei < まずは文字列じゃなくてコードポイントの配列にして、そこでごにょってから、さいごに文字列になおすのが色々正しい雰囲気を感じる
|
203
|
+
# 2014.03.25.txt:16:39:53 <#ruby-ja@ircnet:akr > encoding が壊れている時に、文字の範囲を確定するのは難しいので。
|
204
|
+
# 2014.03.25.txt:16:43:08 <#ruby-ja@ircnet:n0kada > unpackはサロゲートペアの片割れも扱える仕様なんだっけ
|
205
|
+
# 2014.03.25.txt:16:43:41 <#ruby-ja@ircnet:akr > 仕様かどうかは知らない
|
206
|
+
# 2014.03.25.txt:16:44:36 <#ruby-ja@ircnet:akr > 伝統的に寛大だったとは思う
|
207
|
+
# 2014.03.25.txt:16:45:41 (#ruby-ja@ircnet:n0kada ) $ grep -r surrogate spec/rubyspec/core/string/unpack/
|
208
|
+
# 2014.03.25.txt:16:45:42 (#ruby-ja@ircnet:n0kada ) bash: exit 1
|
209
|
+
# 2014.03.25.txt:16:46:06 <#ruby-ja@ircnet:n0kada > rubyspecが持ってないとは意外だな
|
210
|
+
# 2014.03.25.txt:16:46:18 <#ruby-ja@ircnet:n0kada > こういう重箱の隅はお得意だろうに
|
211
|
+
|
212
|
+
#
|
213
|
+
# Local Variables:
|
214
|
+
# mode: ruby
|
215
|
+
# coding: utf-8-unix
|
216
|
+
# indent-tabs-mode: t
|
217
|
+
# tab-width: 3
|
218
|
+
# ruby-indent-level: 3
|
219
|
+
# fill-column: 79
|
220
|
+
# default-justification: full
|
221
|
+
# End:
|