benofsky-yajl-ruby 0.7.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (143) hide show
  1. data/.gitignore +9 -0
  2. data/CHANGELOG.md +281 -0
  3. data/MIT-LICENSE +20 -0
  4. data/README.rdoc +320 -0
  5. data/Rakefile +40 -0
  6. data/VERSION.yml +5 -0
  7. data/benchmark/encode.rb +58 -0
  8. data/benchmark/encode_json_and_marshal.rb +42 -0
  9. data/benchmark/encode_json_and_yaml.rb +53 -0
  10. data/benchmark/http.rb +32 -0
  11. data/benchmark/parse.rb +59 -0
  12. data/benchmark/parse_json_and_marshal.rb +50 -0
  13. data/benchmark/parse_json_and_yaml.rb +55 -0
  14. data/benchmark/parse_stream.rb +54 -0
  15. data/benchmark/subjects/item.json +1 -0
  16. data/benchmark/subjects/ohai.json +1216 -0
  17. data/benchmark/subjects/ohai.marshal_dump +0 -0
  18. data/benchmark/subjects/ohai.yml +975 -0
  19. data/benchmark/subjects/twitter_search.json +1 -0
  20. data/benchmark/subjects/twitter_stream.json +430 -0
  21. data/benchmark/subjects/unicode.json +1 -0
  22. data/examples/encoding/chunked_encoding.rb +27 -0
  23. data/examples/encoding/one_shot.rb +13 -0
  24. data/examples/encoding/to_an_io.rb +12 -0
  25. data/examples/http/twitter_search_api.rb +12 -0
  26. data/examples/http/twitter_stream_api.rb +26 -0
  27. data/examples/parsing/from_file.rb +14 -0
  28. data/examples/parsing/from_stdin.rb +9 -0
  29. data/examples/parsing/from_string.rb +13 -0
  30. data/ext/api/yajl_common.h +85 -0
  31. data/ext/api/yajl_gen.h +159 -0
  32. data/ext/api/yajl_parse.h +196 -0
  33. data/ext/extconf.rb +9 -0
  34. data/ext/yajl.c +164 -0
  35. data/ext/yajl_alloc.c +65 -0
  36. data/ext/yajl_alloc.h +50 -0
  37. data/ext/yajl_buf.c +119 -0
  38. data/ext/yajl_buf.h +73 -0
  39. data/ext/yajl_bytestack.h +85 -0
  40. data/ext/yajl_encode.c +188 -0
  41. data/ext/yajl_encode.h +50 -0
  42. data/ext/yajl_ext.c +911 -0
  43. data/ext/yajl_ext.h +128 -0
  44. data/ext/yajl_gen.c +317 -0
  45. data/ext/yajl_lex.c +747 -0
  46. data/ext/yajl_lex.h +135 -0
  47. data/ext/yajl_parser.c +450 -0
  48. data/ext/yajl_parser.h +82 -0
  49. data/lib/yajl/bzip2/stream_reader.rb +32 -0
  50. data/lib/yajl/bzip2/stream_writer.rb +15 -0
  51. data/lib/yajl/bzip2.rb +11 -0
  52. data/lib/yajl/deflate/stream_reader.rb +44 -0
  53. data/lib/yajl/deflate/stream_writer.rb +21 -0
  54. data/lib/yajl/deflate.rb +6 -0
  55. data/lib/yajl/gzip/stream_reader.rb +31 -0
  56. data/lib/yajl/gzip/stream_writer.rb +14 -0
  57. data/lib/yajl/gzip.rb +6 -0
  58. data/lib/yajl/http_stream.rb +197 -0
  59. data/lib/yajl/json_gem/encoding.rb +50 -0
  60. data/lib/yajl/json_gem/parsing.rb +27 -0
  61. data/lib/yajl/json_gem.rb +14 -0
  62. data/lib/yajl.rb +93 -0
  63. data/spec/encoding/encoding_spec.rb +234 -0
  64. data/spec/global/global_spec.rb +55 -0
  65. data/spec/http/fixtures/http.bzip2.dump +0 -0
  66. data/spec/http/fixtures/http.chunked.dump +11 -0
  67. data/spec/http/fixtures/http.deflate.dump +0 -0
  68. data/spec/http/fixtures/http.error.dump +12 -0
  69. data/spec/http/fixtures/http.gzip.dump +0 -0
  70. data/spec/http/fixtures/http.html.dump +1220 -0
  71. data/spec/http/fixtures/http.raw.dump +1226 -0
  72. data/spec/http/http_delete_spec.rb +99 -0
  73. data/spec/http/http_error_spec.rb +33 -0
  74. data/spec/http/http_get_spec.rb +110 -0
  75. data/spec/http/http_post_spec.rb +124 -0
  76. data/spec/http/http_put_spec.rb +106 -0
  77. data/spec/json_gem_compatibility/compatibility_spec.rb +203 -0
  78. data/spec/parsing/active_support_spec.rb +64 -0
  79. data/spec/parsing/chunked_spec.rb +98 -0
  80. data/spec/parsing/fixtures/fail.15.json +1 -0
  81. data/spec/parsing/fixtures/fail.16.json +1 -0
  82. data/spec/parsing/fixtures/fail.17.json +1 -0
  83. data/spec/parsing/fixtures/fail.26.json +1 -0
  84. data/spec/parsing/fixtures/fail11.json +1 -0
  85. data/spec/parsing/fixtures/fail12.json +1 -0
  86. data/spec/parsing/fixtures/fail13.json +1 -0
  87. data/spec/parsing/fixtures/fail14.json +1 -0
  88. data/spec/parsing/fixtures/fail19.json +1 -0
  89. data/spec/parsing/fixtures/fail20.json +1 -0
  90. data/spec/parsing/fixtures/fail21.json +1 -0
  91. data/spec/parsing/fixtures/fail22.json +1 -0
  92. data/spec/parsing/fixtures/fail23.json +1 -0
  93. data/spec/parsing/fixtures/fail24.json +1 -0
  94. data/spec/parsing/fixtures/fail25.json +1 -0
  95. data/spec/parsing/fixtures/fail27.json +2 -0
  96. data/spec/parsing/fixtures/fail28.json +2 -0
  97. data/spec/parsing/fixtures/fail3.json +1 -0
  98. data/spec/parsing/fixtures/fail4.json +1 -0
  99. data/spec/parsing/fixtures/fail5.json +1 -0
  100. data/spec/parsing/fixtures/fail6.json +1 -0
  101. data/spec/parsing/fixtures/fail9.json +1 -0
  102. data/spec/parsing/fixtures/pass.array.json +6 -0
  103. data/spec/parsing/fixtures/pass.codepoints_from_unicode_org.json +1 -0
  104. data/spec/parsing/fixtures/pass.contacts.json +1 -0
  105. data/spec/parsing/fixtures/pass.db100.xml.json +1 -0
  106. data/spec/parsing/fixtures/pass.db1000.xml.json +1 -0
  107. data/spec/parsing/fixtures/pass.dc_simple_with_comments.json +11 -0
  108. data/spec/parsing/fixtures/pass.deep_arrays.json +1 -0
  109. data/spec/parsing/fixtures/pass.difficult_json_c_test_case.json +1 -0
  110. data/spec/parsing/fixtures/pass.difficult_json_c_test_case_with_comments.json +1 -0
  111. data/spec/parsing/fixtures/pass.doubles.json +1 -0
  112. data/spec/parsing/fixtures/pass.empty_array.json +1 -0
  113. data/spec/parsing/fixtures/pass.empty_string.json +1 -0
  114. data/spec/parsing/fixtures/pass.escaped_bulgarian.json +4 -0
  115. data/spec/parsing/fixtures/pass.escaped_foobar.json +1 -0
  116. data/spec/parsing/fixtures/pass.item.json +1 -0
  117. data/spec/parsing/fixtures/pass.json-org-sample1.json +23 -0
  118. data/spec/parsing/fixtures/pass.json-org-sample2.json +11 -0
  119. data/spec/parsing/fixtures/pass.json-org-sample3.json +26 -0
  120. data/spec/parsing/fixtures/pass.json-org-sample4-nows.json +88 -0
  121. data/spec/parsing/fixtures/pass.json-org-sample4.json +89 -0
  122. data/spec/parsing/fixtures/pass.json-org-sample5.json +27 -0
  123. data/spec/parsing/fixtures/pass.map-spain.xml.json +1 -0
  124. data/spec/parsing/fixtures/pass.ns-invoice100.xml.json +1 -0
  125. data/spec/parsing/fixtures/pass.ns-soap.xml.json +1 -0
  126. data/spec/parsing/fixtures/pass.numbers-fp-4k.json +6 -0
  127. data/spec/parsing/fixtures/pass.numbers-fp-64k.json +61 -0
  128. data/spec/parsing/fixtures/pass.numbers-int-4k.json +11 -0
  129. data/spec/parsing/fixtures/pass.numbers-int-64k.json +154 -0
  130. data/spec/parsing/fixtures/pass.twitter-search.json +1 -0
  131. data/spec/parsing/fixtures/pass.twitter-search2.json +1 -0
  132. data/spec/parsing/fixtures/pass.unicode.json +3315 -0
  133. data/spec/parsing/fixtures/pass.yelp.json +1 -0
  134. data/spec/parsing/fixtures/pass1.json +56 -0
  135. data/spec/parsing/fixtures/pass2.json +1 -0
  136. data/spec/parsing/fixtures/pass3.json +6 -0
  137. data/spec/parsing/fixtures_spec.rb +41 -0
  138. data/spec/parsing/one_off_spec.rb +81 -0
  139. data/spec/rcov.opts +3 -0
  140. data/spec/spec.opts +2 -0
  141. data/spec/spec_helper.rb +16 -0
  142. data/yajl-ruby.gemspec +203 -0
  143. metadata +232 -0
data/ext/yajl_lex.c ADDED
@@ -0,0 +1,747 @@
1
+ /*
2
+ * Copyright 2010, Lloyd Hilaiel.
3
+ *
4
+ * Redistribution and use in source and binary forms, with or without
5
+ * modification, are permitted provided that the following conditions are
6
+ * met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright
9
+ * notice, this list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright
12
+ * notice, this list of conditions and the following disclaimer in
13
+ * the documentation and/or other materials provided with the
14
+ * distribution.
15
+ *
16
+ * 3. Neither the name of Lloyd Hilaiel nor the names of its
17
+ * contributors may be used to endorse or promote products derived
18
+ * from this software without specific prior written permission.
19
+ *
20
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
21
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
22
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
+ * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
24
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
25
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
28
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
29
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30
+ * POSSIBILITY OF SUCH DAMAGE.
31
+ */
32
+
33
+ #include "yajl_lex.h"
34
+ #include "yajl_buf.h"
35
+
36
+ #include <stdlib.h>
37
+ #include <stdio.h>
38
+ #include <assert.h>
39
+ #include <string.h>
40
+
41
#ifdef YAJL_LEXER_DEBUG
/* Map a token to a short, human readable name.  Only used by the debug
 * trace at the end of yajl_lex_lex(), and only compiled when
 * YAJL_LEXER_DEBUG is defined.
 *
 * Opening and closing tokens get distinct names so the trace is
 * unambiguous.  Any value without a case of its own yields "unknown". */
static const char *
tokToStr(yajl_tok tok)
{
    switch (tok) {
        case yajl_tok_bool: return "bool";
        case yajl_tok_colon: return "colon";
        case yajl_tok_comma: return "comma";
        case yajl_tok_eof: return "eof";
        case yajl_tok_error: return "error";
        case yajl_tok_left_brace: return "left_brace";
        case yajl_tok_left_bracket: return "left_bracket";
        case yajl_tok_null: return "null";
        case yajl_tok_integer: return "integer";
        case yajl_tok_double: return "double";
        case yajl_tok_right_brace: return "right_brace";
        case yajl_tok_right_bracket: return "right_bracket";
        case yajl_tok_string: return "string";
        case yajl_tok_string_with_escapes: return "string_with_escapes";
        case yajl_tok_comment: return "comment";
    }
    return "unknown";
}
#endif
64
+
65
+ /* Impact of the stream parsing feature on the lexer:
66
+ *
67
+ * YAJL supports stream parsing. That is, the ability to parse the first
68
+ * bits of a chunk of JSON before the last bits are available (still on
69
+ * the network or disk). This makes the lexer more complex. The
70
+ * responsibility of the lexer is to handle transparently the case where
71
+ * a chunk boundary falls in the middle of a token. This is
72
+ * accomplished via a buffer and a character reading abstraction.
73
+ *
74
+ * Overview of implementation
75
+ *
76
+ * When we lex to end of input string before end of token is hit, we
77
+ * copy all of the input text composing the token into our lexBuf.
78
+ *
79
+ * Every time we read a character, we do so through the readChar function.
80
+ * readChar's responsibility is to handle pulling all chars from the buffer
81
+ * before pulling chars from input text
82
+ */
83
+
84
+ struct yajl_lexer_t {
85
+ /* the overall line and char offset into the data */
86
+ unsigned int lineOff;
87
+ unsigned int charOff;
88
+
89
+ /* detail code of the most recent lexical error (see yajl_lex_get_error) */
90
+ yajl_lex_error error;
91
+
92
+ /* an input buffer to handle the case where a token is spread over
93
+ * multiple chunks */
94
+ yajl_buf buf;
95
+
96
+ /* in the case where we have data in the lexBuf, bufOff holds
97
+ * the current offset into the lexBuf. */
98
+ unsigned int bufOff;
99
+
100
+ /* are we using the lex buf? (non-zero means yes) */
101
+ unsigned int bufInUse;
102
+
103
+ /* shall we allow comments? (non-zero means yes) */
104
+ unsigned int allowComments;
105
+
106
+ /* shall we validate utf8 inside strings? (non-zero means yes) */
107
+ unsigned int validateUTF8;
108
+ /* allocation routines; yajl_lex_free releases the lexer through them */
109
+ yajl_alloc_funcs * alloc;
110
+ };
111
+
112
/* readChar: fetch the next input byte.
 *
 * While the lex buffer is in use and bufOff has not yet reached its end,
 * the byte comes from the buffer and bufOff is advanced.  Otherwise the
 * byte comes from txt and *off is advanced.  The caller must make sure
 * *off is in range before invoking the macro.
 *
 * Every use of a macro parameter is parenthesised so that an arbitrary
 * expression may be passed; note that lxr and off are still evaluated
 * more than once, so arguments must be free of side effects. */
#define readChar(lxr, txt, off)                                          \
    (((lxr)->bufInUse && yajl_buf_len((lxr)->buf) &&                     \
      (lxr)->bufOff < yajl_buf_len((lxr)->buf)) ?                        \
     (*((const unsigned char *) yajl_buf_data((lxr)->buf) +              \
        ((lxr)->bufOff)++)) :                                            \
     ((txt)[(*(off))++]))

/* unreadChar: step back over the byte most recently read.  When *off is
 * non-zero it is decremented, otherwise the buffer offset is. */
#define unreadChar(lxr, off) ((*(off) > 0) ? (*(off))-- : ((lxr)->bufOff--))
118
+
119
+ yajl_lexer
120
+ yajl_lex_alloc(yajl_alloc_funcs * alloc,
121
+ unsigned int allowComments, unsigned int validateUTF8)
122
+ {
123
+ yajl_lexer lxr = (yajl_lexer) YA_MALLOC(alloc, sizeof(struct yajl_lexer_t));
124
+ memset((void *) lxr, 0, sizeof(struct yajl_lexer_t));
125
+ lxr->buf = yajl_buf_alloc(alloc);
126
+ lxr->allowComments = allowComments;
127
+ lxr->validateUTF8 = validateUTF8;
128
+ lxr->alloc = alloc;
129
+ return lxr;
130
+ }
131
+
132
+ yajl_lexer
133
+ yajl_lex_realloc(yajl_lexer orig) {
134
+ yajl_buf_clear(orig->buf);
135
+ orig->bufInUse = 0;
136
+ orig->bufOff = 0;
137
+ orig->lineOff = 0;
138
+ orig->lineOff = 0;
139
+ return orig;
140
+ }
141
+
142
+ void
143
+ yajl_lex_free(yajl_lexer lxr)
144
+ {
145
+ yajl_buf_free(lxr->buf);
146
+ YA_FREE(lxr->alloc, lxr);
147
+ return;
148
+ }
149
+
150
/* A lookup table, indexed by byte value, answering three questions:
 *   VEC - is this a valid character to follow a backslash?
 *   IJC - is this character invalid inside a json string when it
 *         appears unescaped?
 *   VHC - is this a valid hex digit?
 * Note: the solidus '/' may be escaped or not.
 * Note: 'u' carries no VEC flag; the string lexer tests for it before
 *       consulting the table. */
#define VEC 1
#define IJC 2
#define VHC 4
static const char charLookupTable[256] =
{
    /* 0x00 - 0x1f: control characters */
    /* 00 */ IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC,
    /* 08 */ IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC,
    /* 10 */ IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC,
    /* 18 */ IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC,

    /* 0x20 - 0x3f: '"' at 22, '/' at 2f, digits at 30-39 */
    /* 20 */ 0, 0, VEC|IJC, 0, 0, 0, 0, 0,
    /* 28 */ 0, 0, 0, 0, 0, 0, 0, VEC,
    /* 30 */ VHC, VHC, VHC, VHC, VHC, VHC, VHC, VHC,
    /* 38 */ VHC, VHC, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x5f: 'A'-'F' at 41-46, '\\' at 5c */
    /* 40 */ 0, VHC, VHC, VHC, VHC, VHC, VHC, 0,
    /* 48 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 50 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 58 */ 0, 0, 0, 0, VEC|IJC, 0, 0, 0,

    /* 0x60 - 0x7f: 'a'-'f' at 61-66, 'n' at 6e, 'r' at 72, 't' at 74 */
    /* 60 */ 0, VHC, VEC|VHC, VHC, VHC, VHC, VEC|VHC, 0,
    /* 68 */ 0, 0, 0, 0, 0, 0, VEC, 0,
    /* 70 */ 0, 0, VEC, 0, VEC, 0, 0, 0,
    /* 78 */ 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x80 - 0xff: present so that callers never have to range check
     * the byte before indexing */
    /* 80 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 88 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 90 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 98 */ 0, 0, 0, 0, 0, 0, 0, 0,

    /* a0 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* a8 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* b0 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* b8 */ 0, 0, 0, 0, 0, 0, 0, 0,

    /* c0 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* c8 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* d0 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* d8 */ 0, 0, 0, 0, 0, 0, 0, 0,

    /* e0 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* e8 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* f0 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* f8 */ 0, 0, 0, 0, 0, 0, 0, 0
};
203
+
204
+ /** process a variable length utf8 encoded codepoint.
205
+ *
206
+ * returns:
207
+ * yajl_tok_string - if valid utf8 char was parsed and offset was
208
+ * advanced
209
+ * yajl_tok_eof - if end of input was hit before validation could
210
+ * complete
211
+ * yajl_tok_error - if invalid utf8 was encountered
212
+ *
213
+ * NOTE: on error the offset will point to the first char of the
214
+ * invalid utf8 */
215
+ #define UTF8_CHECK_EOF if (*offset >= jsonTextLen) { return yajl_tok_eof; }
216
+
217
+ static yajl_tok
218
+ yajl_lex_utf8_char(yajl_lexer lexer, const unsigned char * jsonText,
219
+ unsigned int jsonTextLen, unsigned int * offset,
220
+ unsigned char curChar)
221
+ {
222
+ if (curChar <= 0x7f) {
223
+ /* single byte */
224
+ return yajl_tok_string;
225
+ } else if ((curChar >> 5) == 0x6) {
226
+ /* two byte */
227
+ UTF8_CHECK_EOF;
228
+ curChar = readChar(lexer, jsonText, offset);
229
+ if ((curChar >> 6) == 0x2) return yajl_tok_string;
230
+ } else if ((curChar >> 4) == 0x0e) {
231
+ /* three byte */
232
+ UTF8_CHECK_EOF;
233
+ curChar = readChar(lexer, jsonText, offset);
234
+ if ((curChar >> 6) == 0x2) {
235
+ UTF8_CHECK_EOF;
236
+ curChar = readChar(lexer, jsonText, offset);
237
+ if ((curChar >> 6) == 0x2) return yajl_tok_string;
238
+ }
239
+ } else if ((curChar >> 3) == 0x1e) {
240
+ /* four byte */
241
+ UTF8_CHECK_EOF;
242
+ curChar = readChar(lexer, jsonText, offset);
243
+ if ((curChar >> 6) == 0x2) {
244
+ UTF8_CHECK_EOF;
245
+ curChar = readChar(lexer, jsonText, offset);
246
+ if ((curChar >> 6) == 0x2) {
247
+ UTF8_CHECK_EOF;
248
+ curChar = readChar(lexer, jsonText, offset);
249
+ if ((curChar >> 6) == 0x2) return yajl_tok_string;
250
+ }
251
+ }
252
+ }
253
+
254
+ return yajl_tok_error;
255
+ }
256
+
257
+ /* lex a string. input is the lexer, pointer to beginning of
258
+ * json text, and start of string (offset).
259
+ * a token is returned which has the following meanings:
260
+ * yajl_tok_string: lex of string was successful. offset points to
261
+ * terminating '"'.
262
+ * yajl_tok_eof: end of text was encountered before we could complete
263
+ * the lex.
264
+ * yajl_tok_error: embedded in the string were unallowable chars. offset
265
+ * points to the offending char
266
+ */
267
+ #define STR_CHECK_EOF \
268
+ if (*offset >= jsonTextLen) { \
269
+ tok = yajl_tok_eof; \
270
+ goto finish_string_lex; \
271
+ }
272
+
273
+ static yajl_tok
274
+ yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText,
275
+ unsigned int jsonTextLen, unsigned int * offset)
276
+ {
277
+ yajl_tok tok = yajl_tok_error;
278
+ int hasEscapes = 0;
279
+
280
+ for (;;) {
281
+ unsigned char curChar;
282
+
283
+ STR_CHECK_EOF;
284
+
285
+ curChar = readChar(lexer, jsonText, offset);
286
+
287
+ /* quote terminates */
288
+ if (curChar == '"') {
289
+ tok = yajl_tok_string;
290
+ break;
291
+ }
292
+ /* backslash escapes a set of control chars, */
293
+ else if (curChar == '\\') {
294
+ hasEscapes = 1;
295
+ STR_CHECK_EOF;
296
+
297
+ /* special case \u */
298
+ curChar = readChar(lexer, jsonText, offset);
299
+ if (curChar == 'u') {
300
+ unsigned int i = 0;
301
+
302
+ for (i=0;i<4;i++) {
303
+ STR_CHECK_EOF;
304
+ curChar = readChar(lexer, jsonText, offset);
305
+ if (!(charLookupTable[curChar] & VHC)) {
306
+ /* back up to offending char */
307
+ unreadChar(lexer, offset);
308
+ lexer->error = yajl_lex_string_invalid_hex_char;
309
+ goto finish_string_lex;
310
+ }
311
+ }
312
+ } else if (!(charLookupTable[curChar] & VEC)) {
313
+ /* back up to offending char */
314
+ unreadChar(lexer, offset);
315
+ lexer->error = yajl_lex_string_invalid_escaped_char;
316
+ goto finish_string_lex;
317
+ }
318
+ }
319
+ /* when not validating UTF8 it's a simple table lookup to determine
320
+ * if the present character is invalid */
321
+ else if(charLookupTable[curChar] & IJC) {
322
+ /* back up to offending char */
323
+ unreadChar(lexer, offset);
324
+ lexer->error = yajl_lex_string_invalid_json_char;
325
+ goto finish_string_lex;
326
+ }
327
+ /* when in validate UTF8 mode we need to do some extra work */
328
+ else if (lexer->validateUTF8) {
329
+ yajl_tok t = yajl_lex_utf8_char(lexer, jsonText, jsonTextLen,
330
+ offset, curChar);
331
+
332
+ if (t == yajl_tok_eof) {
333
+ tok = yajl_tok_eof;
334
+ goto finish_string_lex;
335
+ } else if (t == yajl_tok_error) {
336
+ lexer->error = yajl_lex_string_invalid_utf8;
337
+ goto finish_string_lex;
338
+ }
339
+ }
340
+ /* accept it, and move on */
341
+ }
342
+ finish_string_lex:
343
+ /* tell our buddy, the parser, wether he needs to process this string
344
+ * again */
345
+ if (hasEscapes && tok == yajl_tok_string) {
346
+ tok = yajl_tok_string_with_escapes;
347
+ }
348
+
349
+ return tok;
350
+ }
351
+
352
+ #define RETURN_IF_EOF if (*offset >= jsonTextLen) return yajl_tok_eof;
353
+
354
+ static yajl_tok
355
+ yajl_lex_number(yajl_lexer lexer, const unsigned char * jsonText,
356
+ unsigned int jsonTextLen, unsigned int * offset)
357
+ {
358
+ /** XXX: numbers are the only entities in json that we must lex
359
+ * _beyond_ in order to know that they are complete. There
360
+ * is an ambiguous case for integers at EOF. */
361
+
362
+ unsigned char c;
363
+
364
+ yajl_tok tok = yajl_tok_integer;
365
+
366
+ RETURN_IF_EOF;
367
+ c = readChar(lexer, jsonText, offset);
368
+
369
+ /* optional leading minus */
370
+ if (c == '-') {
371
+ RETURN_IF_EOF;
372
+ c = readChar(lexer, jsonText, offset);
373
+ }
374
+
375
+ /* a single zero, or a series of integers */
376
+ if (c == '0') {
377
+ RETURN_IF_EOF;
378
+ c = readChar(lexer, jsonText, offset);
379
+ } else if (c >= '1' && c <= '9') {
380
+ do {
381
+ RETURN_IF_EOF;
382
+ c = readChar(lexer, jsonText, offset);
383
+ } while (c >= '0' && c <= '9');
384
+ } else {
385
+ unreadChar(lexer, offset);
386
+ lexer->error = yajl_lex_missing_integer_after_minus;
387
+ return yajl_tok_error;
388
+ }
389
+
390
+ /* optional fraction (indicates this is floating point) */
391
+ if (c == '.') {
392
+ int numRd = 0;
393
+
394
+ RETURN_IF_EOF;
395
+ c = readChar(lexer, jsonText, offset);
396
+
397
+ while (c >= '0' && c <= '9') {
398
+ numRd++;
399
+ RETURN_IF_EOF;
400
+ c = readChar(lexer, jsonText, offset);
401
+ }
402
+
403
+ if (!numRd) {
404
+ unreadChar(lexer, offset);
405
+ lexer->error = yajl_lex_missing_integer_after_decimal;
406
+ return yajl_tok_error;
407
+ }
408
+ tok = yajl_tok_double;
409
+ }
410
+
411
+ /* optional exponent (indicates this is floating point) */
412
+ if (c == 'e' || c == 'E') {
413
+ RETURN_IF_EOF;
414
+ c = readChar(lexer, jsonText, offset);
415
+
416
+ /* optional sign */
417
+ if (c == '+' || c == '-') {
418
+ RETURN_IF_EOF;
419
+ c = readChar(lexer, jsonText, offset);
420
+ }
421
+
422
+ if (c >= '0' && c <= '9') {
423
+ do {
424
+ RETURN_IF_EOF;
425
+ c = readChar(lexer, jsonText, offset);
426
+ } while (c >= '0' && c <= '9');
427
+ } else {
428
+ unreadChar(lexer, offset);
429
+ lexer->error = yajl_lex_missing_integer_after_exponent;
430
+ return yajl_tok_error;
431
+ }
432
+ tok = yajl_tok_double;
433
+ }
434
+
435
+ /* we always go "one too far" */
436
+ unreadChar(lexer, offset);
437
+
438
+ return tok;
439
+ }
440
+
441
+ static yajl_tok
442
+ yajl_lex_comment(yajl_lexer lexer, const unsigned char * jsonText,
443
+ unsigned int jsonTextLen, unsigned int * offset)
444
+ {
445
+ unsigned char c;
446
+
447
+ yajl_tok tok = yajl_tok_comment;
448
+
449
+ RETURN_IF_EOF;
450
+ c = readChar(lexer, jsonText, offset);
451
+
452
+ /* either slash or star expected */
453
+ if (c == '/') {
454
+ /* now we throw away until end of line */
455
+ do {
456
+ RETURN_IF_EOF;
457
+ c = readChar(lexer, jsonText, offset);
458
+ } while (c != '\n');
459
+ } else if (c == '*') {
460
+ /* now we throw away until end of comment */
461
+ for (;;) {
462
+ RETURN_IF_EOF;
463
+ c = readChar(lexer, jsonText, offset);
464
+ if (c == '*') {
465
+ RETURN_IF_EOF;
466
+ c = readChar(lexer, jsonText, offset);
467
+ if (c == '/') {
468
+ break;
469
+ } else {
470
+ unreadChar(lexer, offset);
471
+ }
472
+ }
473
+ }
474
+ } else {
475
+ lexer->error = yajl_lex_invalid_char;
476
+ tok = yajl_tok_error;
477
+ }
478
+
479
+ return tok;
480
+ }
481
+
482
+ yajl_tok
483
+ yajl_lex_lex(yajl_lexer lexer, const unsigned char * jsonText,
484
+ unsigned int jsonTextLen, unsigned int * offset,
485
+ const unsigned char ** outBuf, unsigned int * outLen)
486
+ {
487
+ yajl_tok tok = yajl_tok_error;
488
+ unsigned char c;
489
+ unsigned int startOffset = *offset;
490
+
491
+ *outBuf = NULL;
492
+ *outLen = 0;
493
+
494
+ for (;;) {
495
+ assert(*offset <= jsonTextLen);
496
+
497
+ if (*offset >= jsonTextLen) {
498
+ tok = yajl_tok_eof;
499
+ goto lexed;
500
+ }
501
+
502
+ c = readChar(lexer, jsonText, offset);
503
+
504
+ switch (c) {
505
+ case '{':
506
+ tok = yajl_tok_left_bracket;
507
+ goto lexed;
508
+ case '}':
509
+ tok = yajl_tok_right_bracket;
510
+ goto lexed;
511
+ case '[':
512
+ tok = yajl_tok_left_brace;
513
+ goto lexed;
514
+ case ']':
515
+ tok = yajl_tok_right_brace;
516
+ goto lexed;
517
+ case ',':
518
+ tok = yajl_tok_comma;
519
+ goto lexed;
520
+ case ':':
521
+ tok = yajl_tok_colon;
522
+ goto lexed;
523
+ case '\t': case '\n': case '\v': case '\f': case '\r': case ' ':
524
+ startOffset++;
525
+ break;
526
+ case 't': {
527
+ const char * want = "rue";
528
+ do {
529
+ if (*offset >= jsonTextLen) {
530
+ tok = yajl_tok_eof;
531
+ goto lexed;
532
+ }
533
+ c = readChar(lexer, jsonText, offset);
534
+ if (c != *want) {
535
+ unreadChar(lexer, offset);
536
+ lexer->error = yajl_lex_invalid_string;
537
+ tok = yajl_tok_error;
538
+ goto lexed;
539
+ }
540
+ } while (*(++want));
541
+ tok = yajl_tok_bool;
542
+ goto lexed;
543
+ }
544
+ case 'f': {
545
+ const char * want = "alse";
546
+ do {
547
+ if (*offset >= jsonTextLen) {
548
+ tok = yajl_tok_eof;
549
+ goto lexed;
550
+ }
551
+ c = readChar(lexer, jsonText, offset);
552
+ if (c != *want) {
553
+ unreadChar(lexer, offset);
554
+ lexer->error = yajl_lex_invalid_string;
555
+ tok = yajl_tok_error;
556
+ goto lexed;
557
+ }
558
+ } while (*(++want));
559
+ tok = yajl_tok_bool;
560
+ goto lexed;
561
+ }
562
+ case 'n': {
563
+ const char * want = "ull";
564
+ do {
565
+ if (*offset >= jsonTextLen) {
566
+ tok = yajl_tok_eof;
567
+ goto lexed;
568
+ }
569
+ c = readChar(lexer, jsonText, offset);
570
+ if (c != *want) {
571
+ unreadChar(lexer, offset);
572
+ lexer->error = yajl_lex_invalid_string;
573
+ tok = yajl_tok_error;
574
+ goto lexed;
575
+ }
576
+ } while (*(++want));
577
+ tok = yajl_tok_null;
578
+ goto lexed;
579
+ }
580
+ case '"': {
581
+ tok = yajl_lex_string(lexer, (const unsigned char *) jsonText,
582
+ jsonTextLen, offset);
583
+ goto lexed;
584
+ }
585
+ case '-':
586
+ case '0': case '1': case '2': case '3': case '4':
587
+ case '5': case '6': case '7': case '8': case '9': {
588
+ /* integer parsing wants to start from the beginning */
589
+ unreadChar(lexer, offset);
590
+ tok = yajl_lex_number(lexer, (const unsigned char *) jsonText,
591
+ jsonTextLen, offset);
592
+ goto lexed;
593
+ }
594
+ case '/':
595
+ /* hey, look, a probable comment! If comments are disabled
596
+ * it's an error. */
597
+ if (!lexer->allowComments) {
598
+ unreadChar(lexer, offset);
599
+ lexer->error = yajl_lex_unallowed_comment;
600
+ tok = yajl_tok_error;
601
+ goto lexed;
602
+ }
603
+ /* if comments are enabled, then we should try to lex
604
+ * the thing. possible outcomes are
605
+ * - successful lex (tok_comment, which means continue),
606
+ * - malformed comment opening (slash not followed by
607
+ * '*' or '/') (tok_error)
608
+ * - eof hit. (tok_eof) */
609
+ tok = yajl_lex_comment(lexer, (const unsigned char *) jsonText,
610
+ jsonTextLen, offset);
611
+ if (tok == yajl_tok_comment) {
612
+ /* "error" is silly, but that's the initial
613
+ * state of tok. guilty until proven innocent. */
614
+ tok = yajl_tok_error;
615
+ yajl_buf_clear(lexer->buf);
616
+ lexer->bufInUse = 0;
617
+ startOffset = *offset;
618
+ break;
619
+ }
620
+ /* hit error or eof, bail */
621
+ goto lexed;
622
+ default:
623
+ lexer->error = yajl_lex_invalid_char;
624
+ tok = yajl_tok_error;
625
+ goto lexed;
626
+ }
627
+ }
628
+
629
+
630
+ lexed:
631
+ /* need to append to buffer if the buffer is in use or
632
+ * if it's an EOF token */
633
+ if (tok == yajl_tok_eof || lexer->bufInUse) {
634
+ if (!lexer->bufInUse) yajl_buf_clear(lexer->buf);
635
+ lexer->bufInUse = 1;
636
+ yajl_buf_append(lexer->buf, jsonText + startOffset, *offset - startOffset);
637
+ lexer->bufOff = 0;
638
+
639
+ if (tok != yajl_tok_eof) {
640
+ *outBuf = yajl_buf_data(lexer->buf);
641
+ *outLen = yajl_buf_len(lexer->buf);
642
+ lexer->bufInUse = 0;
643
+ }
644
+ } else if (tok != yajl_tok_error) {
645
+ *outBuf = jsonText + startOffset;
646
+ *outLen = *offset - startOffset;
647
+ }
648
+
649
+ /* special case for strings. skip the quotes. */
650
+ if (tok == yajl_tok_string || tok == yajl_tok_string_with_escapes)
651
+ {
652
+ assert(*outLen >= 2);
653
+ (*outBuf)++;
654
+ *outLen -= 2;
655
+ }
656
+
657
+
658
+ #ifdef YAJL_LEXER_DEBUG
659
+ if (tok == yajl_tok_error) {
660
+ printf("lexical error: %s\n",
661
+ yajl_lex_error_to_string(yajl_lex_get_error(lexer)));
662
+ } else if (tok == yajl_tok_eof) {
663
+ printf("EOF hit\n");
664
+ } else {
665
+ printf("lexed %s: '", tokToStr(tok));
666
+ fwrite(*outBuf, 1, *outLen, stdout);
667
+ printf("'\n");
668
+ }
669
+ #endif
670
+
671
+ return tok;
672
+ }
673
+
674
+ const char *
675
+ yajl_lex_error_to_string(yajl_lex_error error)
676
+ {
677
+ switch (error) {
678
+ case yajl_lex_e_ok:
679
+ return "ok, no error";
680
+ case yajl_lex_string_invalid_utf8:
681
+ return "invalid bytes in UTF8 string.";
682
+ case yajl_lex_string_invalid_escaped_char:
683
+ return "inside a string, '\\' occurs before a character "
684
+ "which it may not.";
685
+ case yajl_lex_string_invalid_json_char:
686
+ return "invalid character inside string.";
687
+ case yajl_lex_string_invalid_hex_char:
688
+ return "invalid (non-hex) character occurs after '\\u' inside "
689
+ "string.";
690
+ case yajl_lex_invalid_char:
691
+ return "invalid char in json text.";
692
+ case yajl_lex_invalid_string:
693
+ return "invalid string in json text.";
694
+ case yajl_lex_missing_integer_after_exponent:
695
+ return "malformed number, a digit is required after the exponent.";
696
+ case yajl_lex_missing_integer_after_decimal:
697
+ return "malformed number, a digit is required after the "
698
+ "decimal point.";
699
+ case yajl_lex_missing_integer_after_minus:
700
+ return "malformed number, a digit is required after the "
701
+ "minus sign.";
702
+ case yajl_lex_unallowed_comment:
703
+ return "probable comment found in input text, comments are "
704
+ "not enabled.";
705
+ }
706
+ return "unknown error code";
707
+ }
708
+
709
+
710
+ /** allows access to more specific information about the lexical
711
+ * error when yajl_lex_lex returns yajl_tok_error. */
712
+ yajl_lex_error
713
+ yajl_lex_get_error(yajl_lexer lexer)
714
+ {
715
+ if (lexer == NULL) return (yajl_lex_error) -1;
716
+ return lexer->error;
717
+ }
718
+
719
+ unsigned int yajl_lex_current_line(yajl_lexer lexer)
720
+ {
721
+ return lexer->lineOff;
722
+ }
723
+
724
+ unsigned int yajl_lex_current_char(yajl_lexer lexer)
725
+ {
726
+ return lexer->charOff;
727
+ }
728
+
729
+ yajl_tok yajl_lex_peek(yajl_lexer lexer, const unsigned char * jsonText,
730
+ unsigned int jsonTextLen, unsigned int offset)
731
+ {
732
+ const unsigned char * outBuf;
733
+ unsigned int outLen;
734
+ unsigned int bufLen = yajl_buf_len(lexer->buf);
735
+ unsigned int bufOff = lexer->bufOff;
736
+ unsigned int bufInUse = lexer->bufInUse;
737
+ yajl_tok tok;
738
+
739
+ tok = yajl_lex_lex(lexer, jsonText, jsonTextLen, &offset,
740
+ &outBuf, &outLen);
741
+
742
+ lexer->bufOff = bufOff;
743
+ lexer->bufInUse = bufInUse;
744
+ yajl_buf_truncate(lexer->buf, bufLen);
745
+
746
+ return tok;
747
+ }