RFC7159 7159

Sign up to get free protection for your applications and to get access to all the features.
Files changed (177) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +61 -0
  3. data/.rspec +4 -0
  4. data/.yardopts +2 -0
  5. data/Gemfile +46 -0
  6. data/LICENSE.txt +40 -0
  7. data/README.txt +899 -0
  8. data/RFC7159 +899 -0
  9. data/RFC7159.gemspec +85 -0
  10. data/Rakefile +82 -0
  11. data/lib/RFC7159.rb +80 -0
  12. data/lib/RFC7159/array.rb +134 -0
  13. data/lib/RFC7159/dumper.rb +231 -0
  14. data/lib/RFC7159/false.rb +79 -0
  15. data/lib/RFC7159/null.rb +79 -0
  16. data/lib/RFC7159/number.rb +149 -0
  17. data/lib/RFC7159/object.rb +140 -0
  18. data/lib/RFC7159/parser.ry +267 -0
  19. data/lib/RFC7159/string.rb +221 -0
  20. data/lib/RFC7159/true.rb +79 -0
  21. data/lib/RFC7159/value.rb +96 -0
  22. data/lib/RFC7159/version.rb +48 -0
  23. data/spec/RFC7159_spec.rb +275 -0
  24. data/spec/acceptance/README.txt +66 -0
  25. data/spec/acceptance/invalid/0001-ws/0001-verical-tab.txt +1 -0
  26. data/spec/acceptance/invalid/0001-ws/0002-null.txt +0 -0
  27. data/spec/acceptance/invalid/0001-ws/0003-space-in-number-1.txt +1 -0
  28. data/spec/acceptance/invalid/0001-ws/0004-space-in-number-2.txt +1 -0
  29. data/spec/acceptance/invalid/0001-ws/0005-space-in-number-3.txt +1 -0
  30. data/spec/acceptance/invalid/0001-ws/0006-space-in-number-4.txt +1 -0
  31. data/spec/acceptance/invalid/0001-ws/0007-space-in-number-5.txt +1 -0
  32. data/spec/acceptance/invalid/0001-ws/0008-space-in-number-6.txt +1 -0
  33. data/spec/acceptance/invalid/0001-ws/0009-space-in-literal.txt +1 -0
  34. data/spec/acceptance/invalid/0001-ws/0010-Unicode-LINE_SEPARATOR.txt +1 -0
  35. data/spec/acceptance/invalid/0002-comments/0001-C-style.txt +1 -0
  36. data/spec/acceptance/invalid/0002-comments/0002-C-plusplus-style.txt +2 -0
  37. data/spec/acceptance/invalid/0002-comments/0003-sh-style.txt +2 -0
  38. data/spec/acceptance/invalid/0002-comments/0004-python-docstring.txt +3 -0
  39. data/spec/acceptance/invalid/0002-comments/0005-SQL-style.txt +2 -0
  40. data/spec/acceptance/invalid/0002-comments/0006-BASIC-style.txt +2 -0
  41. data/spec/acceptance/invalid/0003-literals/0001-FALSE.txt +1 -0
  42. data/spec/acceptance/invalid/0003-literals/0002-NULL.txt +1 -0
  43. data/spec/acceptance/invalid/0003-literals/0003-TRUE.txt +1 -0
  44. data/spec/acceptance/invalid/0003-literals/0004-NUL.txt +1 -0
  45. data/spec/acceptance/invalid/0003-literals/0005-nil.txt +1 -0
  46. data/spec/acceptance/invalid/0003-literals/0006-undef.txt +1 -0
  47. data/spec/acceptance/invalid/0003-literals/0007-modifier.txt +1 -0
  48. data/spec/acceptance/invalid/0003-literals/0008-undefined.txt +1 -0
  49. data/spec/acceptance/invalid/0004-numbers/0001-omit-zero.txt +1 -0
  50. data/spec/acceptance/invalid/0004-numbers/0002-minus-dot.txt +1 -0
  51. data/spec/acceptance/invalid/0004-numbers/0003-missing-frac.txt +1 -0
  52. data/spec/acceptance/invalid/0004-numbers/0004-missing-exp.txt +1 -0
  53. data/spec/acceptance/invalid/0004-numbers/0005-octal.txt +1 -0
  54. data/spec/acceptance/invalid/0004-numbers/0006-hexadecimal.txt +1 -0
  55. data/spec/acceptance/invalid/0004-numbers/0007-comma.txt +1 -0
  56. data/spec/acceptance/invalid/0004-numbers/0008-perl-underscore.txt +1 -0
  57. data/spec/acceptance/invalid/0004-numbers/0009-NaN.txt +1 -0
  58. data/spec/acceptance/invalid/0004-numbers/0010-Inf.txt +1 -0
  59. data/spec/acceptance/invalid/0004-numbers/0011-Infinity.txt +1 -0
  60. data/spec/acceptance/invalid/0005-strings/0000-NUL.txt +0 -0
  61. data/spec/acceptance/invalid/0005-strings/0001-not-terminated.txt +1 -0
  62. data/spec/acceptance/invalid/0005-strings/0002-single-quote.txt +1 -0
  63. data/spec/acceptance/invalid/0005-strings/0003-back-quote.txt +1 -0
  64. data/spec/acceptance/invalid/0005-strings/0004-carriage-return.txt +1 -0
  65. data/spec/acceptance/invalid/0005-strings/0005-line-feed.txt +2 -0
  66. data/spec/acceptance/invalid/0005-strings/0006-unknown-escape-a.txt +1 -0
  67. data/spec/acceptance/invalid/0005-strings/0007-unknown-escape-perl-style.txt +1 -0
  68. data/spec/acceptance/invalid/0005-strings/0008-unknown-escape-C-style.txt +1 -0
  69. data/spec/acceptance/invalid/0005-strings/0009-unknown-escape-ruby-style.txt +1 -0
  70. data/spec/acceptance/invalid/0005-strings/0010-escape-too-short.txt +1 -0
  71. data/spec/acceptance/invalid/0005-strings/0011-C-string-concat.txt +1 -0
  72. data/spec/acceptance/invalid/0005-strings/0012-perl-string-concat.txt +1 -0
  73. data/spec/acceptance/invalid/0005-strings/0013-Java-string-concat.txt +1 -0
  74. data/spec/acceptance/invalid/0006-encodings/0001-CESU-8.txt +1 -0
  75. data/spec/acceptance/invalid/0006-encodings/0002-Windows-31J.txt +1 -0
  76. data/spec/acceptance/invalid/0006-encodings/0003-EBCDIC.txt +1 -0
  77. data/spec/acceptance/invalid/0006-encodings/0004-overlong-utf8.txt +1 -0
  78. data/spec/acceptance/invalid/0007-arrays/0001-lacks-open.txt +1 -0
  79. data/spec/acceptance/invalid/0007-arrays/0002-lacks-close.txt +1 -0
  80. data/spec/acceptance/invalid/0007-arrays/0003-interleaving-parens.txt +1 -0
  81. data/spec/acceptance/invalid/0007-arrays/0004-dangling-comma.txt +1 -0
  82. data/spec/acceptance/invalid/0007-arrays/0005-missing-comma.txt +1 -0
  83. data/spec/acceptance/invalid/0007-arrays/0006-colon-instead-of-comma.txt +1 -0
  84. data/spec/acceptance/invalid/0008-hashes/0001-key-missing.txt +1 -0
  85. data/spec/acceptance/invalid/0008-hashes/0002-value-missing.txt +1 -0
  86. data/spec/acceptance/invalid/0008-hashes/0003-true-key.txt +1 -0
  87. data/spec/acceptance/invalid/0008-hashes/0004-false-key.txt +1 -0
  88. data/spec/acceptance/invalid/0008-hashes/0005-null-key.txt +1 -0
  89. data/spec/acceptance/invalid/0008-hashes/0006-numeric-key.txt +1 -0
  90. data/spec/acceptance/invalid/0008-hashes/0007-array-key.txt +1 -0
  91. data/spec/acceptance/invalid/0008-hashes/0008-hash-key.txt +1 -0
  92. data/spec/acceptance/invalid/0008-hashes/0009-key-not-escaped.txt +4 -0
  93. data/spec/acceptance/invalid/0009-javascriptisms/0001-JSONP.txt +1 -0
  94. data/spec/acceptance/invalid/0009-javascriptisms/0002-new-Array.txt +3 -0
  95. data/spec/acceptance/invalid/0009-javascriptisms/0003-new-Date.txt +1 -0
  96. data/spec/acceptance/invalid/0009-javascriptisms/0004-new-Error.txt +1 -0
  97. data/spec/acceptance/invalid/0009-javascriptisms/0005-Math.txt +1 -0
  98. data/spec/acceptance/invalid/0009-javascriptisms/0006-regular-expression.txt +1 -0
  99. data/spec/acceptance/invalid/0009-javascriptisms/0007-function.txt +7 -0
  100. data/spec/acceptance/invalid/0009-javascriptisms/0008-this.txt +1 -0
  101. data/spec/acceptance/invalid/0009-javascriptisms/0009-plusplus.txt +3 -0
  102. data/spec/acceptance/invalid/0009-javascriptisms/0010-ternary-operator.txt +1 -0
  103. data/spec/acceptance/valid/0001-ws/0001-space.json +1 -0
  104. data/spec/acceptance/valid/0001-ws/0002-tab.json +1 -0
  105. data/spec/acceptance/valid/0001-ws/0003-lf.json +1 -0
  106. data/spec/acceptance/valid/0001-ws/0004-cr.json +1 -0
  107. data/spec/acceptance/valid/0001-ws/0005-before.json +1 -0
  108. data/spec/acceptance/valid/0001-ws/0006-after.json +1 -0
  109. data/spec/acceptance/valid/0001-ws/0007-around-comma.json +3 -0
  110. data/spec/acceptance/valid/0001-ws/0008-around-colon.json +3 -0
  111. data/spec/acceptance/valid/0002-bare-values/0001-false.json +1 -0
  112. data/spec/acceptance/valid/0002-bare-values/0002-null.json +1 -0
  113. data/spec/acceptance/valid/0002-bare-values/0003-true.json +1 -0
  114. data/spec/acceptance/valid/0002-bare-values/0004-number.json +1 -0
  115. data/spec/acceptance/valid/0002-bare-values/0005-string.json +1 -0
  116. data/spec/acceptance/valid/0003-literals/0001-false.json +1 -0
  117. data/spec/acceptance/valid/0003-literals/0002-null.json +1 -0
  118. data/spec/acceptance/valid/0003-literals/0003-true.json +1 -0
  119. data/spec/acceptance/valid/0004-numbers/0000-zero.json +1 -0
  120. data/spec/acceptance/valid/0004-numbers/0001-one.json +1 -0
  121. data/spec/acceptance/valid/0004-numbers/0002-two.json +1 -0
  122. data/spec/acceptance/valid/0004-numbers/0003-three.json +1 -0
  123. data/spec/acceptance/valid/0004-numbers/0004-four.json +1 -0
  124. data/spec/acceptance/valid/0004-numbers/0005-five.json +1 -0
  125. data/spec/acceptance/valid/0004-numbers/0006-six.json +1 -0
  126. data/spec/acceptance/valid/0004-numbers/0007-seven.json +1 -0
  127. data/spec/acceptance/valid/0004-numbers/0008-eight.json +1 -0
  128. data/spec/acceptance/valid/0004-numbers/0009-nine.json +1 -0
  129. data/spec/acceptance/valid/0004-numbers/0010-ten.json +1 -0
  130. data/spec/acceptance/valid/0004-numbers/0011-minus.json +1 -0
  131. data/spec/acceptance/valid/0004-numbers/0012-fraction.json +1 -0
  132. data/spec/acceptance/valid/0004-numbers/0013-exponent.json +1 -0
  133. data/spec/acceptance/valid/0004-numbers/0014-exponent-minus.json +1 -0
  134. data/spec/acceptance/valid/0004-numbers/0015-exponent-plus.json +1 -0
  135. data/spec/acceptance/valid/0004-numbers/0016-complex.json +1 -0
  136. data/spec/acceptance/valid/0004-numbers/0017-DBL_MAX.json +1 -0
  137. data/spec/acceptance/valid/0004-numbers/0018-DBL_MIN.json +1 -0
  138. data/spec/acceptance/valid/0004-numbers/0019-subnormal-number.json +1 -0
  139. data/spec/acceptance/valid/0004-numbers/0020-1E400.json +1 -0
  140. data/spec/acceptance/valid/0004-numbers/0021-pi.json +1 -0
  141. data/spec/acceptance/valid/0004-numbers/0022-UINT32_MAX.json +1 -0
  142. data/spec/acceptance/valid/0004-numbers/0023-UINT64_MAX.json +1 -0
  143. data/spec/acceptance/valid/0004-numbers/0024-INT64_MIN.json +1 -0
  144. data/spec/acceptance/valid/0004-numbers/0025-high-resolution-zero.json +1 -0
  145. data/spec/acceptance/valid/0005-strings/0001-empty.json +1 -0
  146. data/spec/acceptance/valid/0005-strings/0002-basic-latin.json +1 -0
  147. data/spec/acceptance/valid/0005-strings/0003-escapes.json +1 -0
  148. data/spec/acceptance/valid/0005-strings/0004-raw-unicode.json +1 -0
  149. data/spec/acceptance/valid/0005-strings/0005-escaped-unicode.json +1 -0
  150. data/spec/acceptance/valid/0005-strings/0006-escaped-NUL.json +1 -0
  151. data/spec/acceptance/valid/0005-strings/0007-escaped-invalid-unicode-still-valid-as-json.json +1 -0
  152. data/spec/acceptance/valid/0005-strings/0008-ruby-json-gem-cant-handle-this.json +1 -0
  153. data/spec/acceptance/valid/0005-strings/0009-unescaped-invalid-javascript-still-valid-as-json.json +1 -0
  154. data/spec/acceptance/valid/0006-m17n/0001-genesis.json +6 -0
  155. data/spec/acceptance/valid/0006-m17n/0002-heart-sutra.json +5 -0
  156. data/spec/acceptance/valid/0006-m17n/0003-escaped-valid-surrogate-pair.json +1 -0
  157. data/spec/acceptance/valid/0006-m17n/0004-unescaped-valid-supplementary-multilingual-plane.json +1 -0
  158. data/spec/acceptance/valid/0007-arrays/0000-empty.json +1 -0
  159. data/spec/acceptance/valid/0007-arrays/0001-one-element.json +1 -0
  160. data/spec/acceptance/valid/0007-arrays/0002-multiple-elements.json +33 -0
  161. data/spec/acceptance/valid/0007-arrays/0003-various-types.json +1 -0
  162. data/spec/acceptance/valid/0007-arrays/0004-nested.json +17 -0
  163. data/spec/acceptance/valid/0008-hashes/0000-empty.json +1 -0
  164. data/spec/acceptance/valid/0008-hashes/0001-onekey.json +1 -0
  165. data/spec/acceptance/valid/0008-hashes/0002-many-keys.json +5 -0
  166. data/spec/acceptance/valid/0008-hashes/0003-empty-key.json +3 -0
  167. data/spec/acceptance/valid/0008-hashes/0004-true-value.json +3 -0
  168. data/spec/acceptance/valid/0008-hashes/0005-false-value.json +3 -0
  169. data/spec/acceptance/valid/0008-hashes/0006-null-value.json +3 -0
  170. data/spec/acceptance/valid/0008-hashes/0007-string-value.json +3 -0
  171. data/spec/acceptance/valid/0008-hashes/0008-numeric-value.json +3 -0
  172. data/spec/acceptance/valid/0008-hashes/0009-array-value.json +8 -0
  173. data/spec/acceptance/valid/0008-hashes/0010-hash-value.json +20 -0
  174. data/spec/acceptance/valid/0008-hashes/0011-duplicate-key-in-different-representations.json +4 -0
  175. data/spec/acceptance/valid/0008-hashes/0011-duplicate-key.json +4 -0
  176. data/spec/spec_helper.rb +54 -0
  177. metadata +520 -0
@@ -0,0 +1,267 @@
1
+ #! /your/favourite/path/to/racc
2
+ # -*- coding: utf-8 -*-
3
+
4
+ # Copyright (c) 2014 Urabe, Shyouhei. All rights reserved.
5
+ #
6
+ # Redistribution and use in source and binary forms, with or without
7
+ # modification, are permitted provided that the following conditions are met:
8
+ #
9
+ # - Redistributions of source code must retain the above copyright
10
+ # notice, this list of conditions and the following disclaimer.
11
+ #
12
+ # - Redistributions in binary form must reproduce the above copyright
13
+ # notice, this list of conditions and the following disclaimer in
14
+ # the documentation and/or other materials provided with the
15
+ # distribution.
16
+ #
17
+ # - Neither the name of Internet Society, IETF or IETF Trust, nor the
18
+ # names of specific contributors, may be used to endorse or promote
19
+ # products derived from this software without specific prior written
20
+ # permission.
21
+ #
22
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS”
23
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25
+ # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
26
+ # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
27
+ # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
28
+ # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
29
+ # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
30
+ # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
31
+ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32
+ # POSSIBILITY OF SUCH DAMAGE.
33
+
34
+ # This is an almost one-to-one translation of RFC7159 sections 2 through 7, from
35
+ # Augmented BNF to Racc BNF. Should be the easiest to verify implementation
36
+ # against the spec.
37
+ #
38
+ # @note This parser has several shift/reduce conflicts. They are all around
39
+ # handling of white spaces (called "ws"), so can silently be ignored. I also
40
+ # checked the parser internals and made sure they are OK.
41
+ class RFC7159::Parser
42
+
43
+ options no_result_var
44
+ expect 28
45
+ rule
46
+
47
+ # Notes about nonterminal's names: in order to make manual verification
48
+ # easy, all the nonterminals that appear in the RFC are named as such. ABNF
49
+ # is much more concise than plain BNF, so here we added several helper
50
+ # nonterminals; they are prefixed with "__" so you can distinguish if a
51
+ # nonterminal is RFC-origin or not.
52
+
53
+ # RFC7159 section 2
54
+
55
+ JSON_text : ws value ws { val[1] }
56
+ begin_array : ws "\x5B" ws # [ left square bracket
57
+ begin_object : ws "\x7B" ws # { left curly bracket
58
+ end_array : ws "\x5D" ws # ] right square bracket
59
+ end_object : ws "\x7D" ws # } right curly bracket
60
+ name_separator : ws "\x3A" ws # : colon
61
+ value_separator : ws "\x2C" ws # , comma
62
+ ws : # <- this is the '*' in the ABNF
63
+ | ws "\x20" # Space
64
+ | ws "\x09" # Horizontal tab
65
+ | ws "\x0A" # Line feed or New line
66
+ | ws "\x0D" # Carriage return
67
+
68
+ # RFC7159 section 3
69
+
70
+ value : false | null | true | object | array | number | string
71
+ false : "\x66" "\x61" "\x6c" "\x73" "\x65" { [ :false ] } # false
72
+ null : "\x6e" "\x75" "\x6c" "\x6c" { [ :null ] } # null
73
+ true : "\x74" "\x72" "\x75" "\x65" { [ :true ] } # true
74
+
75
+ # RFC7159 section 4
76
+
77
+ object : begin_object end_object { [ :object ] }
78
+ | begin_object __members__ end_object { [ :object, *val[1] ] }
79
+ __members__ : member { val }
80
+ | __members__ value_separator member { [ *val[0], val[2] ] }
81
+ member : string name_separator value { [ val[0], val[2] ] }
82
+
83
+ # RFC7159 section 5
84
+
85
+ array : begin_array end_array { [ :array ] }
86
+ | begin_array __list__ end_array { [ :array, *val[1] ] }
87
+ __list__ : value { val }
88
+ | __list__ value_separator value { [ *val[0], val[2] ] }
89
+
90
+ # RFC7159 section 6
91
+
92
+ number : __minus_p__ int __frac_p__ __exp_p__ { [ :number, *val ] }
93
+ __minus_p__ : | minus
94
+ __frac_p__ : | frac
95
+ __exp_p__ : | exp
96
+ decimal_point : "\x2E" # .
97
+ digit1_9 : "\x31" | "\x32" | "\x33" | "\x34" | "\x35"
98
+ | "\x36" | "\x37" | "\x38" | "\x39"
99
+ e : "\x65" | "\x45" # e E
100
+ exp : e __sign__ __digit_plus__ { val }
101
+ frac : decimal_point __digit_plus__ { val }
102
+ int : zero { val }
103
+ | digit1_9 { val }
104
+ | digit1_9 __digit_plus__ { [ val[0], *val[1] ] }
105
+ minus : "\x2D" # -
106
+ plus : "\x2B" # +
107
+ zero : "\x30" # 0
108
+ DIGIT : zero | digit1_9
109
+ __sign__ : | plus | minus
110
+ __digit_plus__ : DIGIT { val }
111
+ | __digit_plus__ DIGIT { [ *val[0], val[1] ] }
112
+
113
+ # RFC7159 section 7
114
+
115
+ string : quotation_mark quotation_mark { [ :string ] }
116
+ | quotation_mark __chars__ quotation_mark { [ :string, *val[1] ] }
117
+ __chars__ : char { val }
118
+ | __chars__ char { [ *val[0], val[1] ] }
119
+ char : unescaped | escape __ctrl__ { val }
120
+ __ctrl__ : "\x22" # " quotation mark U+0022
121
+ | "\x5C" # \ reverse solidus U+005C
122
+ | "\x2F" # / solidus U+002F
123
+ | "\x62" # b backspace U+0008
124
+ | "\x66" # f form feed U+000C
125
+ | "\x6E" # n line feed U+000A
126
+ | "\x72" # r carriage return U+000D
127
+ | "\x74" # t tab U+0009
128
+ | "\x75" # uXXXX U+XXXX
129
+ HEXDIG HEXDIG HEXDIG HEXDIG { val }
130
+ escape : "\x5C" # \
131
+ quotation_mark : "\x22" # "
132
+ HEXDIG : DIGIT
133
+ | "\x61" | "\x62" | "\x63" | "\x64" | "\x65" | "\x66"
134
+ | "\x41" | "\x42" | "\x43" | "\x44" | "\x45" | "\x46"
135
+
136
+ # "unescaped" covers too many code points to enumerate here; the lexer handles it instead.
137
+ # unescaped = %x20-21 / %x23-5B / %x5D-10FFFF
138
+ end
139
+
140
+ ---- inner
141
+
142
+ # @param [true, false] accept_bom Whether to accept BOMs
143
+ # @param [true, false] yydebug Whether to enable debug mode
144
+ def initialize accept_bom: false, yydebug: false
145
+ @accept_bom = accept_bom
146
+ @yydebug = yydebug
147
+ end
148
+
149
+ # Parses str and generates AST. The str must consist of _a_ valid JSON
150
+ # text, otherwise an exception is raised.
151
+ #
152
+ # @param [#each_char] str IO or String or something to parse
153
+ # @return [::Array] Parsed AST
154
+ # @raise [Racc::ParseError] The input is invalid
155
+ # @raise [Encoding::CompatibilityError] The input is invalid
156
+ def parse str
157
+ @state = :init
158
+ @enum = str.enum_for:each_char
159
+ firstchar = @enum.peek
160
+ @lineno = 1
161
+ @column = 1
162
+
163
+ case enc = firstchar.encoding
164
+ when Encoding::UTF_8,
165
+ Encoding::US_ASCII, # true subset of UTF-8
166
+ Encoding::UTF8_MAC, # true subset of UTF-8
167
+ Encoding::UTF_16LE,
168
+ Encoding::UTF_16BE,
169
+ Encoding::UTF_32LE,
170
+ Encoding::UTF_32BE
171
+ # RFC7159 section 8.1 explicitly states that the input string must be
172
+ # either UTF 8, 16, or 32 -encoded. That point is as clear as the
173
+ # sky. All other encodings are NG. However, what we call the ASCII
174
+ # encoding is the true subset of UTF-8. A string of ASCII must also
175
+ # be valid as UTF-8. So we allow this.
176
+ #
177
+ # There are discussions about parsing BOMs. The original RFC4627 said
178
+ # nothing about BOMs, however its section 3 ("Encoding") cannot be
179
+ # read as if it expected BOMs. Current RFC7159 _prohibits_ generating
180
+ # JSON texts with BOMs but _allows_ parsers to accept them.
181
+ #
182
+ # This parser can control whether to accept BOMs.
183
+ if @accept_bom and firstchar == "\u{feff}".encode(enc)
184
+ @enum.next # consume
185
+ end
186
+ return do_parse
187
+ else
188
+ raise Encoding::CompatibilityError, <<-"end".gsub(/[\n\s]+/, ' ')
189
+ ``JSON text SHALL be encoded in UTF-8, UTF-16, or UTF-32'', said
190
+ RFC7159 section 8.1. The given string is NOT in any of those
191
+ encodings (but #{enc.inspect}).
192
+ end
193
+ end
194
+ end
195
+
196
+ private
197
+ def next_token
198
+ chr = @enum.next
199
+ tok = chr # default
200
+ newline, @newline = @newline, /[\r\n]/.match(chr)
201
+ if newline
202
+ @lineno += 1
203
+ @column = 1
204
+ else
205
+ @column += 1
206
+ end
207
+ case @state
208
+ when :string then # recap: unescaped = %x20-21 / %x23-5B / %x5D-10FFFF
209
+ case chr.ord
210
+ when 0x20..0x21 then tok = :unescaped
211
+ when 0x22 then @state = :init # "
212
+ when 0x23..0x5B then tok = :unescaped
213
+ when 0x5C then @state = :escaped # \
214
+ when 0x5D..0x10FFFF then tok = :unescaped
215
+ else @state = :string # NG unicode
216
+ end
217
+ when :init then @state = (chr == '"') ? :string : :init
218
+ when :escaped then @state = (chr == 'u') ? :u1 : :string
219
+ when :u1 then @state = :u2
220
+ when :u2 then @state = :u3
221
+ when :u3 then @state = :u4
222
+ when :u4 then @state = :string
223
+ end
224
+ return tok, chr
225
+ rescue StopIteration
226
+ return false, @enum
227
+ end
228
+
229
+ def on_error id, val, stack
230
+ reason = case @state
231
+ when :string
232
+ 'this character is not allowed in a string; escape it.'
233
+ when :u1, :u2, :u3, :u4
234
+ '\uXXXX must exactly be a four-letter hexadecimal sequence.'
235
+ else
236
+ case val
237
+ when "'"
238
+ 'you must use " to quote strings'
239
+ when '}', ']', ','
240
+ 'possible extra (dangling) comma?'
241
+ when ':'
242
+ 'possible confusion of {} vs []?'
243
+ when /\s/
244
+ 'possible space inside of a number?'
245
+ when /\d/
246
+ 'possible lack of +/- in exponent?'
247
+ else
248
+ 'unexpected character'
249
+ end
250
+ end
251
+ msg = sprintf 'Syntax error near line %d, char %d (%p): %s',
252
+ @lineno, @column, val, reason
253
+ raise Racc::ParseError, msg
254
+ end
255
+
256
+ ---- footer
257
+
258
+ #
259
+ # Local Variables:
260
+ # mode: ruby
261
+ # coding: utf-8-unix
262
+ # indent-tabs-mode: t
263
+ # tab-width: 3
264
+ # ruby-indent-level: 3
265
+ # fill-column: 79
266
+ # default-justification: full
267
+ # End:
@@ -0,0 +1,221 @@
1
+ #! /your/favourite/path/to/ruby
2
+ # -*- coding: utf-8 -*-
3
+
4
+ # Copyright (c) 2014 Urabe, Shyouhei. All rights reserved.
5
+ #
6
+ # Redistribution and use in source and binary forms, with or without
7
+ # modification, are permitted provided that the following conditions are met:
8
+ #
9
+ # - Redistributions of source code must retain the above copyright
10
+ # notice, this list of conditions and the following disclaimer.
11
+ #
12
+ # - Redistributions in binary form must reproduce the above copyright
13
+ # notice, this list of conditions and the following disclaimer in
14
+ # the documentation and/or other materials provided with the
15
+ # distribution.
16
+ #
17
+ # - Neither the name of Internet Society, IETF or IETF Trust, nor the
18
+ # names of specific contributors, may be used to endorse or promote
19
+ # products derived from this software without specific prior written
20
+ # permission.
21
+ #
22
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS”
23
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25
+ # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
26
+ # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
27
+ # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
28
+ # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
29
+ # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
30
+ # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
31
+ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32
+ # POSSIBILITY OF SUCH DAMAGE.
33
+
34
+ # The Strings, as described in RFC7159 section 7.
35
+ class RFC7159::String < RFC7159::Value
36
+ # Parse the AST from parser, and convert into corresponding values.
37
+ # @param [::Array] ast the AST, generated by the parser
38
+ # @return [String] evaluated instance
39
+ # @raise [ArgumentError] malformed input
40
+ def self.from_ast ast
41
+ type, *ary = *ast
42
+ raise ArgumentError, "not an object: #{ast.inspect}" if type != :string
43
+ new ary
44
+ end
45
+
46
+ # @return [::String] converted string
47
+ def plain_old_ruby_object
48
+ return @str
49
+ end
50
+
51
+ alias to_s plain_old_ruby_object
52
+ alias to_str plain_old_ruby_object
53
+
54
+ # @return [::String] the string, escaped
55
+ def inspect
56
+ sprintf "#<%p:%#016x %p>", self.class, self.object_id << 1, @str
57
+ end
58
+
59
+ # For pretty print
60
+ # @param [PP] pp the pp
61
+ def pretty_print pp
62
+ hdr = sprintf '#<%p:%#016x', self.class, self.object_id << 1
63
+ pp.group 1, hdr, '>' do
64
+ pp.breakable
65
+ @str.pretty_print pp
66
+ end
67
+ end
68
+
69
+ # @return [string] original string
70
+ def to_json *;
71
+ '"' << @orig.flatten.join('') << '"'
72
+ end
73
+
74
+ # String comparisons are defined in RFC7159 section 8.3. We follow that.
75
+ def == other
76
+ self.to_str == other.to_str
77
+ rescue NoMethodError
78
+ return false
79
+ end
80
+
81
+ private
82
+ private_class_method:new
83
+ # @private
84
+ def initialize ary
85
+ @orig = ary
86
+ enc = ary[0][0].encoding rescue Encoding::US_ASCII # empty string
87
+ path1 = ary.map do |i|
88
+ case i when Array
89
+ # ['\\', ['u', 'F', 'F', 'E', 'E']] or something
90
+ case i[1]
91
+ when "\x22" then 0x0022 # " quotation mark U+0022
92
+ when "\x5C" then 0x005C # \ reverse solidus U+005C
93
+ when "\x2F" then 0x002F # / solidus U+002F
94
+ when "\x62" then 0x0008 # b backspace U+0008
95
+ when "\x66" then 0x000C # f form feed U+000C
96
+ when "\x6E" then 0x000A # n line feed U+000A
97
+ when "\x72" then 0x000D # r carriage return U+000D
98
+ when "\x74" then 0x0009 # t tab U+0009
99
+ else # uXXXX U+XXXX
100
+ i[1][1..4].join.to_i 16
101
+ end
102
+ else
103
+ i.ord
104
+ end
105
+ end
106
+
107
+ # RFC7159 section 8.1 states that the JSON text itself shall be written
108
+ # in a sort of Unicode. However the parsed JSON value's content strings
109
+ # are not always Unicode-valid, according to its section 8.2. Then what?
110
+ # It says nothing. Here, we try to preserve the JSON text's encoding
111
+ # i.e. if the JSON text is in UTF-16, we try UTF-16. If that doesn't
112
+ # fit, we give up and take BINARY.
113
+ buf = nil
114
+ path2 = path1.each_with_object Array.new do |i, r|
115
+ if buf.nil?
116
+ next buf = i
117
+ else
118
+ case buf when 0xD800..0xDBFF
119
+ case i when 0xDC00..0xDFFF
120
+ # valid surrogate pair
121
+ utf16str = [buf, i].pack 'nn'
122
+ utf16str.force_encoding Encoding::UTF_16BE
123
+ r << utf16str[0].ord
124
+ buf = nil # consumed
125
+ else
126
+ # buf is garbage (a high surrogate not followed by a low surrogate)
127
+ r << buf
128
+ buf = i
129
+ end
130
+ else
131
+ # buf is a normal char
132
+ r << buf
133
+ buf = i
134
+ end
135
+ end
136
+ end
137
+ path2 << buf if buf # buf might remain
138
+
139
+ path3 = path2.each_with_object ''.b do |i, r|
140
+ case enc
141
+ when Encoding::UTF_32BE then j = [i].pack 'N'
142
+ when Encoding::UTF_32LE then j = [i].pack 'V'
143
+ when Encoding::UTF_16BE then j = [i].pack 'n'
144
+ when Encoding::UTF_16LE then j = [i].pack 'v'
145
+ else j = [i].pack 'U' # sort of UTF-8
146
+ end
147
+ r << j.b
148
+ end
149
+ path4 = path3.dup.force_encoding enc
150
+ # @str = path4.valid_encoding? ? path4 : path3
151
+ @str = path4
152
+ @str.freeze
153
+ end
154
+ end
155
+
156
+ #
157
+ # Dialogue about evaluating JSON's string
158
+ # ----
159
+ # 2014.03.17.txt:20:50:01 >#ruby-ja@ircnet:shyouhei < JSONのRFC、文字列が"\uDEAD"とかなっててもvalidだよって書いてあるけど、
160
+ # 2014.03.17.txt:20:50:14 >#ruby-ja@ircnet:shyouhei < それはいいのだが
161
+ # 2014.03.17.txt:20:50:32 >#ruby-ja@ircnet:shyouhei < たとえばそのJSONがUTF-16で書かれているとして
162
+ # 2014.03.17.txt:20:50:59 >#ruby-ja@ircnet:shyouhei < UTF-16の"\uDEAD"的なのをRubyで作ろうと思うとなかなかむずかしいな
163
+ # 2014.03.17.txt:20:51:55 >#ruby-ja@ircnet:shyouhei < "\\uDEAD"という文字列(ただしUTF-16)を入力したら"\u{DEAD}"という文字列(ただしUTF-16)を出力する関数
164
+ # 2014.03.17.txt:20:52:08 >#ruby-ja@ircnet:shyouhei < むずい。
165
+ # 2014.03.17.txt:20:52:09 <#ruby-ja@ircnet:nurse > "\xDE\xAD".force_encoding("utf-16be")とかになっちゃいますなぁ
166
+ # 2014.03.17.txt:20:52:34 <#ruby-ja@ircnet:nurse > [0xDEAD].pack("n").force_encoding("utf-16be")のが素直かな
167
+ # 2014.03.17.txt:20:53:35 >#ruby-ja@ircnet:shyouhei < なんか実務上はそこまでがんばるより例外で死んだ方がしあわせになれそうではある
168
+ # 2014.03.17.txt:20:54:00 >#ruby-ja@ircnet:shyouhei < 誰も幸せにしなさそう
169
+ # 2014.03.17.txt:20:54:26 <#ruby-ja@ircnet:nurse > 死んじゃダメで、ゲタにするのが正解じゃないっけ
170
+ # 2014.03.17.txt:20:54:54 >#ruby-ja@ircnet:shyouhei < それがより正しそうですね
171
+ # 2014.03.17.txt:20:55:56 >#ruby-ja@ircnet:shyouhei < JSONはサロゲートペアもなんとかせねばならんので面倒そうだ
172
+ # 2014.03.17.txt:20:57:06 >#ruby-ja@ircnet:shyouhei < (\uXYZW が単体でNGぽいくても次にサロゲートペアが続くかもしれん)
173
+ # 2014.03.17.txt:20:57:37 >#ruby-ja@ircnet:shyouhei < めんどう!
174
+ # 2014.03.17.txt:20:57:42 >#ruby-ja@ircnet:shyouhei < UTF16しねばいいのに
175
+ # 2014.03.17.txt:20:59:06 <#ruby-ja@ircnet:nurse > とりあえずそのままUTF-16にしてみて、encodeでinvalid replaceすればいい気がする
176
+ # 2014.03.17.txt:21:00:33 >#ruby-ja@ircnet:shyouhei < すでにUTF16な文字列にサロゲートペアの片割れ的なバイナリをがしょがしょって後ろから足してからencodeするとよしなにする?
177
+ # 2014.03.17.txt:21:01:13 >#ruby-ja@ircnet:shyouhei < (頭の悪い発言なのは自覚しております)
178
+ # 2014.03.17.txt:21:01:29 <#ruby-ja@ircnet:nurse > invalid: :replaceつけてUTF-8にするなり、UTF-16のままscrubすれば
179
+ # 2014.03.17.txt:21:02:45 >#ruby-ja@ircnet:shyouhei < invalidなのはよいとして "\uFOO\uBAR" てきなサロゲートペアてきJSON文字列をちゃんとRuby的に(正しいUTF16文字列)に復元するシナリオ
180
+ # 2014.03.17.txt:21:03:46 <#ruby-ja@ircnet:nurse > たぶんAScii-8BITで足さないとエラーになる気がする
181
+ # 2014.03.17.txt:21:04:05 <#ruby-ja@ircnet:nurse > そこいがいは、無心につなげて、最後にencodeまたはscrubが正解ではないかと
182
+ # 2014.03.17.txt:21:04:13 >#ruby-ja@ircnet:shyouhei < あきらめて全部バイナリと思ってくっつけておいてから最後にencodeか
183
+ # 2014.03.17.txt:21:05:20 <#ruby-ja@ircnet:nurse > ASCII-8BITだと文字列のvalidチェックしない分速いし。
184
+ # 2014.03.17.txt:21:06:33 >#ruby-ja@ircnet:shyouhei < 世の中のJSONパーザがUTF16サポートしないという姿勢にはそれなりの理由があることがわかった。
185
+ # 2014.03.17.txt:21:07:17 <#ruby-ja@ircnet:nurse > そもそもHTTPで文字列流すのにASCII非互換ってのが邪悪である
186
+ # 2014.03.17.txt:21:15:04 <#ruby-ja@ircnet:nurse > 例のOpenBSDのsignifyをportableにしたらRubyでも使えるかなぁ
187
+ # 2014.03.17.txt:21:18:39 <#ruby-ja@ircnet:nurse > ていうか卜部さんはJSONパーサでも書いてるのかしら
188
+ # 2014.03.17.txt:21:18:56 <#ruby-ja@ircnet:nurse > って、聞いちゃいけない質問な気がした
189
+ # ----
190
+ # 2014.03.25.txt:16:08:14 >#ruby-ja@ircnet:shyouhei < "\u{dead}" を入力されたときに "\\uDEAD" を出力する関数を作成せよ
191
+ # 2014.03.25.txt:16:09:21 >#ruby-ja@ircnet:shyouhei < str.force_encoding('utf-8').scrub {|c| "\\u" + c.unpack('H*") } はだめぽい
192
+ # 2014.03.25.txt:16:14:13 >#ruby-ja@ircnet:shyouhei < primitive_convertでなんとかなるのかこれ
193
+ # 2014.03.25.txt:16:20:10 <#ruby-ja@ircnet:n0kada > "\u{dead}"ってinvalidなんだっけ
194
+ # 2014.03.25.txt:16:22:29 >#ruby-ja@ircnet:shyouhei < サロゲートペアのかたほう
195
+ # 2014.03.25.txt:16:22:44 >#ruby-ja@ircnet:shyouhei < それだけではinvalidすね
196
+ # 2014.03.25.txt:16:34:47 >#ruby-ja@ircnet:shyouhei < お、"\u{dead}".unpack('U*')で0xdeadが取得できる
197
+ # 2014.03.25.txt:16:34:57 >#ruby-ja@ircnet:shyouhei < ここからなんとかすればいいのか…?
198
+ # 2014.03.25.txt:16:35:00 >#ruby-ja@ircnet:shyouhei < しかしどうする
199
+ # 2014.03.25.txt:16:35:08 <#ruby-ja@ircnet:akr > "\u{dead}".unpack("U*").map {|c| 0xD800 <= c && c <= 0xDFFF ? "\\u%04X" % c : [c].pack("U") }.join
200
+ # 2014.03.25.txt:16:38:16 >#ruby-ja@ircnet:shyouhei < おお。
201
+ # 2014.03.25.txt:16:38:46 >#ruby-ja@ircnet:shyouhei < scrubでなんとかするのは筋が悪いことが分かりつつある
202
+ # 2014.03.25.txt:16:39:36 >#ruby-ja@ircnet:shyouhei < まずは文字列じゃなくてコードポイントの配列にして、そこでごにょってから、さいごに文字列になおすのが色々正しい雰囲気を感じる
203
+ # 2014.03.25.txt:16:39:53 <#ruby-ja@ircnet:akr > encoding が壊れている時に、文字の範囲を確定するのは難しいので。
204
+ # 2014.03.25.txt:16:43:08 <#ruby-ja@ircnet:n0kada > unpackはサロゲートペアの片割れも扱える仕様なんだっけ
205
+ # 2014.03.25.txt:16:43:41 <#ruby-ja@ircnet:akr > 仕様かどうかは知らない
206
+ # 2014.03.25.txt:16:44:36 <#ruby-ja@ircnet:akr > 伝統的に寛大だったとは思う
207
+ # 2014.03.25.txt:16:45:41 (#ruby-ja@ircnet:n0kada ) $ grep -r surrogate spec/rubyspec/core/string/unpack/
208
+ # 2014.03.25.txt:16:45:42 (#ruby-ja@ircnet:n0kada ) bash: exit 1
209
+ # 2014.03.25.txt:16:46:06 <#ruby-ja@ircnet:n0kada > rubyspecが持ってないとは意外だな
210
+ # 2014.03.25.txt:16:46:18 <#ruby-ja@ircnet:n0kada > こういう重箱の隅はお得意だろうに
211
+
212
+ #
213
+ # Local Variables:
214
+ # mode: ruby
215
+ # coding: utf-8-unix
216
+ # indent-tabs-mode: t
217
+ # tab-width: 3
218
+ # ruby-indent-level: 3
219
+ # fill-column: 79
220
+ # default-justification: full
221
+ # End: