brianmario-yajl-ruby 0.4.6 → 0.4.7
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG.rdoc +10 -0
- data/README.rdoc +25 -23
- data/Rakefile +11 -1
- data/VERSION.yml +2 -2
- data/benchmark/encode.rb +1 -1
- data/benchmark/encode_json_and_marshal.rb +1 -1
- data/benchmark/encode_json_and_yaml.rb +1 -1
- data/benchmark/parse.rb +1 -1
- data/benchmark/parse_json_and_marshal.rb +1 -1
- data/benchmark/parse_json_and_yaml.rb +1 -1
- data/benchmark/subjects/unicode.json +1 -3315
- data/ext/api/yajl_common.h +85 -0
- data/ext/api/yajl_gen.h +123 -0
- data/ext/api/yajl_parse.h +179 -0
- data/ext/extconf.rb +2 -8
- data/ext/yajl.c +128 -260
- data/ext/yajl_alloc.c +65 -0
- data/ext/yajl_alloc.h +50 -0
- data/ext/yajl_buf.c +119 -0
- data/ext/yajl_buf.h +73 -0
- data/ext/yajl_bytestack.h +85 -0
- data/ext/yajl_encode.c +179 -0
- data/ext/yajl_encode.h +44 -0
- data/ext/yajl_ext.c +283 -0
- data/ext/{yajl.h → yajl_ext.h} +2 -2
- data/ext/yajl_gen.c +295 -0
- data/ext/yajl_lex.c +737 -0
- data/ext/yajl_lex.h +133 -0
- data/ext/yajl_parser.c +445 -0
- data/ext/yajl_parser.h +79 -0
- data/lib/yajl/bzip2.rb +1 -1
- data/lib/yajl/deflate.rb +1 -1
- data/lib/yajl/gzip.rb +1 -1
- data/lib/yajl/http_stream.rb +1 -1
- data/lib/yajl.rb +1 -1
- data/spec/encoding/encoding_spec.rb +23 -0
- data/spec/http/{http.bzip2.dump → fixtures/http.bzip2.dump} +0 -0
- data/spec/http/{http.deflate.dump → fixtures/http.deflate.dump} +0 -0
- data/spec/http/{http.gzip.dump → fixtures/http.gzip.dump} +0 -0
- data/spec/http/{http.raw.dump → fixtures/http.raw.dump} +0 -0
- data/spec/http/http_spec.rb +98 -0
- data/spec/{active_support_spec.rb → parsing/active_support_spec.rb} +1 -1
- data/spec/{fixtures → parsing/fixtures}/fail.15.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/fail.16.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/fail.17.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/fail.26.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/fail11.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/fail12.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/fail13.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/fail14.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/fail19.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/fail20.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/fail21.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/fail22.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/fail23.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/fail24.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/fail25.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/fail27.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/fail28.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/fail3.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/fail4.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/fail5.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/fail6.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/fail9.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/pass.array.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/pass.codepoints_from_unicode_org.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/pass.contacts.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/pass.db100.xml.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/pass.db1000.xml.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/pass.dc_simple_with_comments.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/pass.deep_arrays.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/pass.difficult_json_c_test_case.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/pass.difficult_json_c_test_case_with_comments.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/pass.doubles.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/pass.empty_array.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/pass.empty_string.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/pass.escaped_bulgarian.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/pass.escaped_foobar.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/pass.item.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/pass.json-org-sample1.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/pass.json-org-sample2.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/pass.json-org-sample3.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/pass.json-org-sample4-nows.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/pass.json-org-sample4.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/pass.json-org-sample5.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/pass.map-spain.xml.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/pass.ns-invoice100.xml.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/pass.ns-soap.xml.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/pass.numbers-fp-4k.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/pass.numbers-fp-64k.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/pass.numbers-int-4k.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/pass.numbers-int-64k.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/pass.twitter-search.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/pass.twitter-search2.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/pass.unicode.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/pass.yelp.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/pass1.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/pass2.json +0 -0
- data/spec/{fixtures → parsing/fixtures}/pass3.json +0 -0
- data/spec/{fixtures_spec.rb → parsing/fixtures_spec.rb} +1 -1
- data/spec/{one_off_spec.rb → parsing/one_off_spec.rb} +1 -1
- data/yajl-ruby.gemspec +91 -72
- metadata +91 -71
- data/benchmark/subjects/yelp.json +0 -1
data/ext/yajl_lex.c
ADDED
@@ -0,0 +1,737 @@
|
|
1
|
+
/*
|
2
|
+
* Copyright 2007-2009, Lloyd Hilaiel.
|
3
|
+
*
|
4
|
+
* Redistribution and use in source and binary forms, with or without
|
5
|
+
* modification, are permitted provided that the following conditions are
|
6
|
+
* met:
|
7
|
+
*
|
8
|
+
* 1. Redistributions of source code must retain the above copyright
|
9
|
+
* notice, this list of conditions and the following disclaimer.
|
10
|
+
*
|
11
|
+
* 2. Redistributions in binary form must reproduce the above copyright
|
12
|
+
* notice, this list of conditions and the following disclaimer in
|
13
|
+
* the documentation and/or other materials provided with the
|
14
|
+
* distribution.
|
15
|
+
*
|
16
|
+
* 3. Neither the name of Lloyd Hilaiel nor the names of its
|
17
|
+
* contributors may be used to endorse or promote products derived
|
18
|
+
* from this software without specific prior written permission.
|
19
|
+
*
|
20
|
+
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
|
21
|
+
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
22
|
+
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
23
|
+
* DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
|
24
|
+
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
25
|
+
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
26
|
+
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
27
|
+
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
28
|
+
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
|
29
|
+
* IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
30
|
+
* POSSIBILITY OF SUCH DAMAGE.
|
31
|
+
*/
|
32
|
+
|
33
|
+
#include "yajl_lex.h"
|
34
|
+
#include "yajl_buf.h"
|
35
|
+
|
36
|
+
#include <stdlib.h>
|
37
|
+
#include <stdio.h>
|
38
|
+
#include <assert.h>
|
39
|
+
#include <string.h>
|
40
|
+
|
41
|
+
#ifdef YAJL_LEXER_DEBUG
|
42
|
+
static const char *
|
43
|
+
tokToStr(yajl_tok tok)
|
44
|
+
{
|
45
|
+
switch (tok) {
|
46
|
+
case yajl_tok_bool: return "bool";
|
47
|
+
case yajl_tok_colon: return "colon";
|
48
|
+
case yajl_tok_comma: return "comma";
|
49
|
+
case yajl_tok_eof: return "eof";
|
50
|
+
case yajl_tok_error: return "error";
|
51
|
+
case yajl_tok_left_brace: return "brace";
|
52
|
+
case yajl_tok_left_bracket: return "bracket";
|
53
|
+
case yajl_tok_null: return "null";
|
54
|
+
case yajl_tok_integer: return "integer";
|
55
|
+
case yajl_tok_double: return "double";
|
56
|
+
case yajl_tok_right_brace: return "brace";
|
57
|
+
case yajl_tok_right_bracket: return "bracket";
|
58
|
+
case yajl_tok_string: return "string";
|
59
|
+
case yajl_tok_string_with_escapes: return "string_with_escapes";
|
60
|
+
}
|
61
|
+
return "unknown";
|
62
|
+
}
|
63
|
+
#endif
|
64
|
+
|
65
|
+
/* Impact of the stream parsing feature on the lexer:
|
66
|
+
*
|
67
|
+
* YAJL supports stream parsing. That is, the ability to parse the first
|
68
|
+
* bits of a chunk of JSON before the last bits are available (still on
|
69
|
+
* the network or disk). This makes the lexer more complex. The
|
70
|
+
* responsibility of the lexer is to handle transparently the case where
|
71
|
+
* a chunk boundary falls in the middle of a token. This is
|
72
|
+
* accomplished via a buffer and a character reading abstraction.
|
73
|
+
*
|
74
|
+
* Overview of implementation
|
75
|
+
*
|
76
|
+
* When we lex to end of input string before end of token is hit, we
|
77
|
+
* copy all of the input text composing the token into our lexBuf.
|
78
|
+
*
|
79
|
+
* Every time we read a character, we do so through the readChar function.
|
80
|
+
* readChar's responsibility is to handle pulling all chars from the buffer
|
81
|
+
* before pulling chars from input text
|
82
|
+
*/
|
83
|
+
|
84
|
+
struct yajl_lexer_t {
|
85
|
+
/* the overal line and char offset into the data */
|
86
|
+
unsigned int lineOff;
|
87
|
+
unsigned int charOff;
|
88
|
+
|
89
|
+
/* error */
|
90
|
+
yajl_lex_error error;
|
91
|
+
|
92
|
+
/* a input buffer to handle the case where a token is spread over
|
93
|
+
* multiple chunks */
|
94
|
+
yajl_buf buf;
|
95
|
+
|
96
|
+
/* in the case where we have data in the lexBuf, bufOff holds
|
97
|
+
* the current offset into the lexBuf. */
|
98
|
+
unsigned int bufOff;
|
99
|
+
|
100
|
+
/* are we using the lex buf? */
|
101
|
+
unsigned int bufInUse;
|
102
|
+
|
103
|
+
/* shall we allow comments? */
|
104
|
+
unsigned int allowComments;
|
105
|
+
|
106
|
+
/* shall we validate utf8 inside strings? */
|
107
|
+
unsigned int validateUTF8;
|
108
|
+
|
109
|
+
yajl_alloc_funcs * alloc;
|
110
|
+
};
|
111
|
+
|
112
|
+
/* Fetch the next input byte and advance past it.
 *
 * While the lexer's buffer is in use and still has unread bytes, the byte
 * comes from the buffer and lxr->bufOff is advanced; otherwise it comes
 * from txt[*off] and *off is advanced.
 *
 * Every use of a macro parameter is parenthesized so that an expression
 * argument (e.g. `base + i`) expands correctly.  Arguments are evaluated
 * more than once, so they must be free of side effects. */
#define readChar(lxr, txt, off)                                             \
    (((lxr)->bufInUse && yajl_buf_len((lxr)->buf) &&                        \
      (lxr)->bufOff < yajl_buf_len((lxr)->buf)) ?                           \
     (*((const unsigned char *) yajl_buf_data((lxr)->buf) + ((lxr)->bufOff)++)) : \
     ((txt)[(*(off))++]))
|
116
|
+
|
117
|
+
/* Step back one character: decrements *off when it is non-zero, otherwise
 * decrements the lexer's buffer offset. */
#define unreadChar(lxr, off) ((*(off) > 0) ? (*(off))-- : ((lxr)->bufOff--))
|
118
|
+
|
119
|
+
yajl_lexer
|
120
|
+
yajl_lex_alloc(yajl_alloc_funcs * alloc,
|
121
|
+
unsigned int allowComments, unsigned int validateUTF8)
|
122
|
+
{
|
123
|
+
yajl_lexer lxr = (yajl_lexer) YA_MALLOC(alloc, sizeof(struct yajl_lexer_t));
|
124
|
+
memset((void *) lxr, 0, sizeof(struct yajl_lexer_t));
|
125
|
+
lxr->buf = yajl_buf_alloc(alloc);
|
126
|
+
lxr->allowComments = allowComments;
|
127
|
+
lxr->validateUTF8 = validateUTF8;
|
128
|
+
lxr->alloc = alloc;
|
129
|
+
return lxr;
|
130
|
+
}
|
131
|
+
|
132
|
+
void
|
133
|
+
yajl_lex_free(yajl_lexer lxr)
|
134
|
+
{
|
135
|
+
yajl_buf_free(lxr->buf);
|
136
|
+
YA_FREE(lxr->alloc, lxr);
|
137
|
+
return;
|
138
|
+
}
|
139
|
+
|
140
|
+
/* a lookup table which lets us quickly determine three things:
|
141
|
+
* VEC - valid escaped control char
|
142
|
+
* IJC - invalid json char
|
143
|
+
* VHC - valid hex char
|
144
|
+
* note. the solidus '/' may be escaped or not.
|
145
|
+
* note. the
|
146
|
+
*/
|
147
|
+
#define VEC 1   /* valid after a backslash inside a string */
#define IJC 2   /* may not appear unescaped inside a string */
#define VHC 4   /* valid hexadecimal digit */
static const char charLookupTable[256] =
{
    /* 0x00-0x0f */
    IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC,
    IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC,
    /* 0x10-0x1f */
    IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC,
    IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC,
    /* 0x20-0x2f: '"' is 0x22, '/' is 0x2f */
    0, 0, VEC|IJC, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, VEC,
    /* 0x30-0x3f: '0'-'9' */
    VHC, VHC, VHC, VHC, VHC, VHC, VHC, VHC,
    VHC, VHC, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4f: 'A'-'F' */
    0, VHC, VHC, VHC, VHC, VHC, VHC, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5f: '\\' is 0x5c */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, VEC|IJC, 0, 0, 0,
    /* 0x60-0x6f: 'a'-'f' ('b' and 'f' are also escapes), 'n' is 0x6e */
    0, VHC, VEC|VHC, VHC, VHC, VHC, VEC|VHC, 0,
    0, 0, 0, 0, 0, 0, VEC, 0,
    /* 0x70-0x7f: 'r' is 0x72, 't' is 0x74 */
    0, 0, VEC, 0, VEC, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0

    /* 0x80-0xff carry no flags.  The array is still 256 entries long, so
     * any unsigned char is a valid index; the entries not listed here
     * are zero-initialized. */
};
|
193
|
+
|
194
|
+
/** process a variable length utf8 encoded codepoint.
|
195
|
+
*
|
196
|
+
* returns:
|
197
|
+
* yajl_tok_string - if valid utf8 char was parsed and offset was
|
198
|
+
* advanced
|
199
|
+
* yajl_tok_eof - if end of input was hit before validation could
|
200
|
+
* complete
|
201
|
+
* yajl_tok_error - if invalid utf8 was encountered
|
202
|
+
*
|
203
|
+
* NOTE: on error the offset will point to the first char of the
|
204
|
+
* invalid utf8 */
|
205
|
+
#define UTF8_CHECK_EOF if (*offset >= jsonTextLen) { return yajl_tok_eof; }
|
206
|
+
|
207
|
+
static yajl_tok
|
208
|
+
yajl_lex_utf8_char(yajl_lexer lexer, const unsigned char * jsonText,
|
209
|
+
unsigned int jsonTextLen, unsigned int * offset,
|
210
|
+
unsigned char curChar)
|
211
|
+
{
|
212
|
+
if (curChar <= 0x7f) {
|
213
|
+
/* single byte */
|
214
|
+
return yajl_tok_string;
|
215
|
+
} else if ((curChar >> 5) == 0x6) {
|
216
|
+
/* two byte */
|
217
|
+
UTF8_CHECK_EOF;
|
218
|
+
curChar = readChar(lexer, jsonText, offset);
|
219
|
+
if ((curChar >> 6) == 0x2) return yajl_tok_string;
|
220
|
+
} else if ((curChar >> 4) == 0x0e) {
|
221
|
+
/* three byte */
|
222
|
+
UTF8_CHECK_EOF;
|
223
|
+
curChar = readChar(lexer, jsonText, offset);
|
224
|
+
if ((curChar >> 6) == 0x2) {
|
225
|
+
UTF8_CHECK_EOF;
|
226
|
+
curChar = readChar(lexer, jsonText, offset);
|
227
|
+
if ((curChar >> 6) == 0x2) return yajl_tok_string;
|
228
|
+
}
|
229
|
+
} else if ((curChar >> 3) == 0x1e) {
|
230
|
+
/* four byte */
|
231
|
+
UTF8_CHECK_EOF;
|
232
|
+
curChar = readChar(lexer, jsonText, offset);
|
233
|
+
if ((curChar >> 6) == 0x2) {
|
234
|
+
UTF8_CHECK_EOF;
|
235
|
+
curChar = readChar(lexer, jsonText, offset);
|
236
|
+
if ((curChar >> 6) == 0x2) {
|
237
|
+
UTF8_CHECK_EOF;
|
238
|
+
curChar = readChar(lexer, jsonText, offset);
|
239
|
+
if ((curChar >> 6) == 0x2) return yajl_tok_string;
|
240
|
+
}
|
241
|
+
}
|
242
|
+
}
|
243
|
+
|
244
|
+
return yajl_tok_error;
|
245
|
+
}
|
246
|
+
|
247
|
+
/* lex a string. input is the lexer, pointer to beginning of
|
248
|
+
* json text, and start of string (offset).
|
249
|
+
* a token is returned which has the following meanings:
|
250
|
+
* yajl_tok_string: lex of string was successful. offset points to
|
251
|
+
* terminating '"'.
|
252
|
+
* yajl_tok_eof: end of text was encountered before we could complete
|
253
|
+
* the lex.
|
254
|
+
* yajl_tok_error: embedded in the string were unallowable chars. offset
|
255
|
+
* points to the offending char
|
256
|
+
*/
|
257
|
+
#define STR_CHECK_EOF \
|
258
|
+
if (*offset >= jsonTextLen) { \
|
259
|
+
tok = yajl_tok_eof; \
|
260
|
+
goto finish_string_lex; \
|
261
|
+
}
|
262
|
+
|
263
|
+
static yajl_tok
|
264
|
+
yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText,
|
265
|
+
unsigned int jsonTextLen, unsigned int * offset)
|
266
|
+
{
|
267
|
+
yajl_tok tok = yajl_tok_error;
|
268
|
+
int hasEscapes = 0;
|
269
|
+
|
270
|
+
for (;;) {
|
271
|
+
unsigned char curChar;
|
272
|
+
|
273
|
+
STR_CHECK_EOF;
|
274
|
+
|
275
|
+
curChar = readChar(lexer, jsonText, offset);
|
276
|
+
|
277
|
+
/* quote terminates */
|
278
|
+
if (curChar == '"') {
|
279
|
+
tok = yajl_tok_string;
|
280
|
+
break;
|
281
|
+
}
|
282
|
+
/* backslash escapes a set of control chars, */
|
283
|
+
else if (curChar == '\\') {
|
284
|
+
hasEscapes = 1;
|
285
|
+
STR_CHECK_EOF;
|
286
|
+
|
287
|
+
/* special case \u */
|
288
|
+
curChar = readChar(lexer, jsonText, offset);
|
289
|
+
if (curChar == 'u') {
|
290
|
+
unsigned int i = 0;
|
291
|
+
|
292
|
+
for (i=0;i<4;i++) {
|
293
|
+
STR_CHECK_EOF;
|
294
|
+
curChar = readChar(lexer, jsonText, offset);
|
295
|
+
if (!(charLookupTable[curChar] & VHC)) {
|
296
|
+
/* back up to offending char */
|
297
|
+
unreadChar(lexer, offset);
|
298
|
+
lexer->error = yajl_lex_string_invalid_hex_char;
|
299
|
+
goto finish_string_lex;
|
300
|
+
}
|
301
|
+
}
|
302
|
+
} else if (!(charLookupTable[curChar] & VEC)) {
|
303
|
+
/* back up to offending char */
|
304
|
+
unreadChar(lexer, offset);
|
305
|
+
lexer->error = yajl_lex_string_invalid_escaped_char;
|
306
|
+
goto finish_string_lex;
|
307
|
+
}
|
308
|
+
}
|
309
|
+
/* when not validating UTF8 it's a simple table lookup to determine
|
310
|
+
* if the present character is invalid */
|
311
|
+
else if(charLookupTable[curChar] & IJC) {
|
312
|
+
/* back up to offending char */
|
313
|
+
unreadChar(lexer, offset);
|
314
|
+
lexer->error = yajl_lex_string_invalid_json_char;
|
315
|
+
goto finish_string_lex;
|
316
|
+
}
|
317
|
+
/* when in validate UTF8 mode we need to do some extra work */
|
318
|
+
else if (lexer->validateUTF8) {
|
319
|
+
yajl_tok t = yajl_lex_utf8_char(lexer, jsonText, jsonTextLen,
|
320
|
+
offset, curChar);
|
321
|
+
|
322
|
+
if (t == yajl_tok_eof) {
|
323
|
+
tok = yajl_tok_eof;
|
324
|
+
goto finish_string_lex;
|
325
|
+
} else if (t == yajl_tok_error) {
|
326
|
+
lexer->error = yajl_lex_string_invalid_utf8;
|
327
|
+
goto finish_string_lex;
|
328
|
+
}
|
329
|
+
}
|
330
|
+
/* accept it, and move on */
|
331
|
+
}
|
332
|
+
finish_string_lex:
|
333
|
+
/* tell our buddy, the parser, wether he needs to process this string
|
334
|
+
* again */
|
335
|
+
if (hasEscapes && tok == yajl_tok_string) {
|
336
|
+
tok = yajl_tok_string_with_escapes;
|
337
|
+
}
|
338
|
+
|
339
|
+
return tok;
|
340
|
+
}
|
341
|
+
|
342
|
+
#define RETURN_IF_EOF if (*offset >= jsonTextLen) return yajl_tok_eof;
|
343
|
+
|
344
|
+
static yajl_tok
|
345
|
+
yajl_lex_number(yajl_lexer lexer, const unsigned char * jsonText,
|
346
|
+
unsigned int jsonTextLen, unsigned int * offset)
|
347
|
+
{
|
348
|
+
/** XXX: numbers are the only entities in json that we must lex
|
349
|
+
* _beyond_ in order to know that they are complete. There
|
350
|
+
* is an ambiguous case for integers at EOF. */
|
351
|
+
|
352
|
+
unsigned char c;
|
353
|
+
|
354
|
+
yajl_tok tok = yajl_tok_integer;
|
355
|
+
|
356
|
+
RETURN_IF_EOF;
|
357
|
+
c = readChar(lexer, jsonText, offset);
|
358
|
+
|
359
|
+
/* optional leading minus */
|
360
|
+
if (c == '-') {
|
361
|
+
RETURN_IF_EOF;
|
362
|
+
c = readChar(lexer, jsonText, offset);
|
363
|
+
}
|
364
|
+
|
365
|
+
/* a single zero, or a series of integers */
|
366
|
+
if (c == '0') {
|
367
|
+
RETURN_IF_EOF;
|
368
|
+
c = readChar(lexer, jsonText, offset);
|
369
|
+
} else if (c >= '1' && c <= '9') {
|
370
|
+
do {
|
371
|
+
RETURN_IF_EOF;
|
372
|
+
c = readChar(lexer, jsonText, offset);
|
373
|
+
} while (c >= '0' && c <= '9');
|
374
|
+
} else {
|
375
|
+
unreadChar(lexer, offset);
|
376
|
+
lexer->error = yajl_lex_missing_integer_after_minus;
|
377
|
+
return yajl_tok_error;
|
378
|
+
}
|
379
|
+
|
380
|
+
/* optional fraction (indicates this is floating point) */
|
381
|
+
if (c == '.') {
|
382
|
+
int numRd = 0;
|
383
|
+
|
384
|
+
RETURN_IF_EOF;
|
385
|
+
c = readChar(lexer, jsonText, offset);
|
386
|
+
|
387
|
+
while (c >= '0' && c <= '9') {
|
388
|
+
numRd++;
|
389
|
+
RETURN_IF_EOF;
|
390
|
+
c = readChar(lexer, jsonText, offset);
|
391
|
+
}
|
392
|
+
|
393
|
+
if (!numRd) {
|
394
|
+
unreadChar(lexer, offset);
|
395
|
+
lexer->error = yajl_lex_missing_integer_after_decimal;
|
396
|
+
return yajl_tok_error;
|
397
|
+
}
|
398
|
+
tok = yajl_tok_double;
|
399
|
+
}
|
400
|
+
|
401
|
+
/* optional exponent (indicates this is floating point) */
|
402
|
+
if (c == 'e' || c == 'E') {
|
403
|
+
RETURN_IF_EOF;
|
404
|
+
c = readChar(lexer, jsonText, offset);
|
405
|
+
|
406
|
+
/* optional sign */
|
407
|
+
if (c == '+' || c == '-') {
|
408
|
+
RETURN_IF_EOF;
|
409
|
+
c = readChar(lexer, jsonText, offset);
|
410
|
+
}
|
411
|
+
|
412
|
+
if (c >= '0' && c <= '9') {
|
413
|
+
do {
|
414
|
+
RETURN_IF_EOF;
|
415
|
+
c = readChar(lexer, jsonText, offset);
|
416
|
+
} while (c >= '0' && c <= '9');
|
417
|
+
} else {
|
418
|
+
unreadChar(lexer, offset);
|
419
|
+
lexer->error = yajl_lex_missing_integer_after_exponent;
|
420
|
+
return yajl_tok_error;
|
421
|
+
}
|
422
|
+
tok = yajl_tok_double;
|
423
|
+
}
|
424
|
+
|
425
|
+
/* we always go "one too far" */
|
426
|
+
unreadChar(lexer, offset);
|
427
|
+
|
428
|
+
return tok;
|
429
|
+
}
|
430
|
+
|
431
|
+
static yajl_tok
|
432
|
+
yajl_lex_comment(yajl_lexer lexer, const unsigned char * jsonText,
|
433
|
+
unsigned int jsonTextLen, unsigned int * offset)
|
434
|
+
{
|
435
|
+
unsigned char c;
|
436
|
+
|
437
|
+
yajl_tok tok = yajl_tok_comment;
|
438
|
+
|
439
|
+
RETURN_IF_EOF;
|
440
|
+
c = readChar(lexer, jsonText, offset);
|
441
|
+
|
442
|
+
/* either slash or star expected */
|
443
|
+
if (c == '/') {
|
444
|
+
/* now we throw away until end of line */
|
445
|
+
do {
|
446
|
+
RETURN_IF_EOF;
|
447
|
+
c = readChar(lexer, jsonText, offset);
|
448
|
+
} while (c != '\n');
|
449
|
+
} else if (c == '*') {
|
450
|
+
/* now we throw away until end of comment */
|
451
|
+
for (;;) {
|
452
|
+
RETURN_IF_EOF;
|
453
|
+
c = readChar(lexer, jsonText, offset);
|
454
|
+
if (c == '*') {
|
455
|
+
RETURN_IF_EOF;
|
456
|
+
c = readChar(lexer, jsonText, offset);
|
457
|
+
if (c == '/') {
|
458
|
+
break;
|
459
|
+
} else {
|
460
|
+
unreadChar(lexer, offset);
|
461
|
+
}
|
462
|
+
}
|
463
|
+
}
|
464
|
+
} else {
|
465
|
+
lexer->error = yajl_lex_invalid_char;
|
466
|
+
tok = yajl_tok_error;
|
467
|
+
}
|
468
|
+
|
469
|
+
return tok;
|
470
|
+
}
|
471
|
+
|
472
|
+
yajl_tok
|
473
|
+
yajl_lex_lex(yajl_lexer lexer, const unsigned char * jsonText,
|
474
|
+
unsigned int jsonTextLen, unsigned int * offset,
|
475
|
+
const unsigned char ** outBuf, unsigned int * outLen)
|
476
|
+
{
|
477
|
+
yajl_tok tok = yajl_tok_error;
|
478
|
+
unsigned char c;
|
479
|
+
unsigned int startOffset = *offset;
|
480
|
+
|
481
|
+
*outBuf = NULL;
|
482
|
+
*outLen = 0;
|
483
|
+
|
484
|
+
for (;;) {
|
485
|
+
assert(*offset <= jsonTextLen);
|
486
|
+
|
487
|
+
if (*offset >= jsonTextLen) {
|
488
|
+
tok = yajl_tok_eof;
|
489
|
+
goto lexed;
|
490
|
+
}
|
491
|
+
|
492
|
+
c = readChar(lexer, jsonText, offset);
|
493
|
+
|
494
|
+
switch (c) {
|
495
|
+
case '{':
|
496
|
+
tok = yajl_tok_left_bracket;
|
497
|
+
goto lexed;
|
498
|
+
case '}':
|
499
|
+
tok = yajl_tok_right_bracket;
|
500
|
+
goto lexed;
|
501
|
+
case '[':
|
502
|
+
tok = yajl_tok_left_brace;
|
503
|
+
goto lexed;
|
504
|
+
case ']':
|
505
|
+
tok = yajl_tok_right_brace;
|
506
|
+
goto lexed;
|
507
|
+
case ',':
|
508
|
+
tok = yajl_tok_comma;
|
509
|
+
goto lexed;
|
510
|
+
case ':':
|
511
|
+
tok = yajl_tok_colon;
|
512
|
+
goto lexed;
|
513
|
+
case '\t': case '\n': case '\v': case '\f': case '\r': case ' ':
|
514
|
+
startOffset++;
|
515
|
+
break;
|
516
|
+
case 't': {
|
517
|
+
const char * want = "rue";
|
518
|
+
do {
|
519
|
+
if (*offset >= jsonTextLen) {
|
520
|
+
tok = yajl_tok_eof;
|
521
|
+
goto lexed;
|
522
|
+
}
|
523
|
+
c = readChar(lexer, jsonText, offset);
|
524
|
+
if (c != *want) {
|
525
|
+
unreadChar(lexer, offset);
|
526
|
+
lexer->error = yajl_lex_invalid_string;
|
527
|
+
tok = yajl_tok_error;
|
528
|
+
goto lexed;
|
529
|
+
}
|
530
|
+
} while (*(++want));
|
531
|
+
tok = yajl_tok_bool;
|
532
|
+
goto lexed;
|
533
|
+
}
|
534
|
+
case 'f': {
|
535
|
+
const char * want = "alse";
|
536
|
+
do {
|
537
|
+
if (*offset >= jsonTextLen) {
|
538
|
+
tok = yajl_tok_eof;
|
539
|
+
goto lexed;
|
540
|
+
}
|
541
|
+
c = readChar(lexer, jsonText, offset);
|
542
|
+
if (c != *want) {
|
543
|
+
unreadChar(lexer, offset);
|
544
|
+
lexer->error = yajl_lex_invalid_string;
|
545
|
+
tok = yajl_tok_error;
|
546
|
+
goto lexed;
|
547
|
+
}
|
548
|
+
} while (*(++want));
|
549
|
+
tok = yajl_tok_bool;
|
550
|
+
goto lexed;
|
551
|
+
}
|
552
|
+
case 'n': {
|
553
|
+
const char * want = "ull";
|
554
|
+
do {
|
555
|
+
if (*offset >= jsonTextLen) {
|
556
|
+
tok = yajl_tok_eof;
|
557
|
+
goto lexed;
|
558
|
+
}
|
559
|
+
c = readChar(lexer, jsonText, offset);
|
560
|
+
if (c != *want) {
|
561
|
+
unreadChar(lexer, offset);
|
562
|
+
lexer->error = yajl_lex_invalid_string;
|
563
|
+
tok = yajl_tok_error;
|
564
|
+
goto lexed;
|
565
|
+
}
|
566
|
+
} while (*(++want));
|
567
|
+
tok = yajl_tok_null;
|
568
|
+
goto lexed;
|
569
|
+
}
|
570
|
+
case '"': {
|
571
|
+
tok = yajl_lex_string(lexer, (const unsigned char *) jsonText,
|
572
|
+
jsonTextLen, offset);
|
573
|
+
goto lexed;
|
574
|
+
}
|
575
|
+
case '-':
|
576
|
+
case '0': case '1': case '2': case '3': case '4':
|
577
|
+
case '5': case '6': case '7': case '8': case '9': {
|
578
|
+
/* integer parsing wants to start from the beginning */
|
579
|
+
unreadChar(lexer, offset);
|
580
|
+
tok = yajl_lex_number(lexer, (const unsigned char *) jsonText,
|
581
|
+
jsonTextLen, offset);
|
582
|
+
goto lexed;
|
583
|
+
}
|
584
|
+
case '/':
|
585
|
+
/* hey, look, a probable comment! If comments are disabled
|
586
|
+
* it's an error. */
|
587
|
+
if (!lexer->allowComments) {
|
588
|
+
unreadChar(lexer, offset);
|
589
|
+
lexer->error = yajl_lex_unallowed_comment;
|
590
|
+
tok = yajl_tok_error;
|
591
|
+
goto lexed;
|
592
|
+
}
|
593
|
+
/* if comments are enabled, then we should try to lex
|
594
|
+
* the thing. possible outcomes are
|
595
|
+
* - successful lex (tok_comment, which means continue),
|
596
|
+
* - malformed comment opening (slash not followed by
|
597
|
+
* '*' or '/') (tok_error)
|
598
|
+
* - eof hit. (tok_eof) */
|
599
|
+
tok = yajl_lex_comment(lexer, (const unsigned char *) jsonText,
|
600
|
+
jsonTextLen, offset);
|
601
|
+
if (tok == yajl_tok_comment) {
|
602
|
+
/* "error" is silly, but that's the initial
|
603
|
+
* state of tok. guilty until proven innocent. */
|
604
|
+
tok = yajl_tok_error;
|
605
|
+
yajl_buf_clear(lexer->buf);
|
606
|
+
lexer->bufInUse = 0;
|
607
|
+
startOffset = *offset;
|
608
|
+
break;
|
609
|
+
}
|
610
|
+
/* hit error or eof, bail */
|
611
|
+
goto lexed;
|
612
|
+
default:
|
613
|
+
lexer->error = yajl_lex_invalid_char;
|
614
|
+
tok = yajl_tok_error;
|
615
|
+
goto lexed;
|
616
|
+
}
|
617
|
+
}
|
618
|
+
|
619
|
+
|
620
|
+
lexed:
|
621
|
+
/* need to append to buffer if the buffer is in use or
|
622
|
+
* if it's an EOF token */
|
623
|
+
if (tok == yajl_tok_eof || lexer->bufInUse) {
|
624
|
+
if (!lexer->bufInUse) yajl_buf_clear(lexer->buf);
|
625
|
+
lexer->bufInUse = 1;
|
626
|
+
yajl_buf_append(lexer->buf, jsonText + startOffset, *offset - startOffset);
|
627
|
+
lexer->bufOff = 0;
|
628
|
+
|
629
|
+
if (tok != yajl_tok_eof) {
|
630
|
+
*outBuf = yajl_buf_data(lexer->buf);
|
631
|
+
*outLen = yajl_buf_len(lexer->buf);
|
632
|
+
lexer->bufInUse = 0;
|
633
|
+
}
|
634
|
+
} else if (tok != yajl_tok_error) {
|
635
|
+
*outBuf = jsonText + startOffset;
|
636
|
+
*outLen = *offset - startOffset;
|
637
|
+
}
|
638
|
+
|
639
|
+
/* special case for strings. skip the quotes. */
|
640
|
+
if (tok == yajl_tok_string || tok == yajl_tok_string_with_escapes)
|
641
|
+
{
|
642
|
+
assert(*outLen >= 2);
|
643
|
+
(*outBuf)++;
|
644
|
+
*outLen -= 2;
|
645
|
+
}
|
646
|
+
|
647
|
+
|
648
|
+
#ifdef YAJL_LEXER_DEBUG
|
649
|
+
if (tok == yajl_tok_error) {
|
650
|
+
printf("lexical error: %s\n",
|
651
|
+
yajl_lex_error_to_string(yajl_lex_get_error(lexer)));
|
652
|
+
} else if (tok == yajl_tok_eof) {
|
653
|
+
printf("EOF hit\n");
|
654
|
+
} else {
|
655
|
+
printf("lexed %s: '", tokToStr(tok));
|
656
|
+
fwrite(*outBuf, 1, *outLen, stdout);
|
657
|
+
printf("'\n");
|
658
|
+
}
|
659
|
+
#endif
|
660
|
+
|
661
|
+
return tok;
|
662
|
+
}
|
663
|
+
|
664
|
+
const char *
|
665
|
+
yajl_lex_error_to_string(yajl_lex_error error)
|
666
|
+
{
|
667
|
+
switch (error) {
|
668
|
+
case yajl_lex_e_ok:
|
669
|
+
return "ok, no error";
|
670
|
+
case yajl_lex_string_invalid_utf8:
|
671
|
+
return "invalid bytes in UTF8 string.";
|
672
|
+
case yajl_lex_string_invalid_escaped_char:
|
673
|
+
return "inside a string, '\\' occurs before a character "
|
674
|
+
"which it may not.";
|
675
|
+
case yajl_lex_string_invalid_json_char:
|
676
|
+
return "invalid character inside string.";
|
677
|
+
case yajl_lex_string_invalid_hex_char:
|
678
|
+
return "invalid (non-hex) character occurs after '\\u' inside "
|
679
|
+
"string.";
|
680
|
+
case yajl_lex_invalid_char:
|
681
|
+
return "invalid char in json text.";
|
682
|
+
case yajl_lex_invalid_string:
|
683
|
+
return "invalid string in json text.";
|
684
|
+
case yajl_lex_missing_integer_after_exponent:
|
685
|
+
return "malformed number, a digit is required after the exponent.";
|
686
|
+
case yajl_lex_missing_integer_after_decimal:
|
687
|
+
return "malformed number, a digit is required after the "
|
688
|
+
"decimal point.";
|
689
|
+
case yajl_lex_missing_integer_after_minus:
|
690
|
+
return "malformed number, a digit is required after the "
|
691
|
+
"minus sign.";
|
692
|
+
case yajl_lex_unallowed_comment:
|
693
|
+
return "probable comment found in input text, comments are "
|
694
|
+
"not enabled.";
|
695
|
+
}
|
696
|
+
return "unknown error code";
|
697
|
+
}
|
698
|
+
|
699
|
+
|
700
|
+
/** allows access to more specific information about the lexical
|
701
|
+
* error when yajl_lex_lex returns yajl_tok_error. */
|
702
|
+
yajl_lex_error
|
703
|
+
yajl_lex_get_error(yajl_lexer lexer)
|
704
|
+
{
|
705
|
+
if (lexer == NULL) return (yajl_lex_error) -1;
|
706
|
+
return lexer->error;
|
707
|
+
}
|
708
|
+
|
709
|
+
/* Return the lexer's line offset into the data, or 0 for a NULL lexer
 * (yajl_lex_get_error tolerates NULL in the same way). */
unsigned int yajl_lex_current_line(yajl_lexer lexer)
{
    if (lexer == NULL) return 0;
    return lexer->lineOff;
}
|
713
|
+
|
714
|
+
/* Return the lexer's character offset into the data, or 0 for a NULL
 * lexer (yajl_lex_get_error tolerates NULL in the same way). */
unsigned int yajl_lex_current_char(yajl_lexer lexer)
{
    if (lexer == NULL) return 0;
    return lexer->charOff;
}
|
718
|
+
|
719
|
+
/* Report which token yajl_lex_lex would return next without consuming it.
 *
 * The offset is passed by value, so the caller's position is untouched.
 * The lexer's buffer length, buffer offset and in-use flag are saved
 * before the trial lex and put back afterwards. */
yajl_tok yajl_lex_peek(yajl_lexer lexer, const unsigned char * jsonText,
                       unsigned int jsonTextLen, unsigned int offset)
{
    /* snapshot of the buffer state the trial lex may modify */
    unsigned int savedLen = yajl_buf_len(lexer->buf);
    unsigned int savedOff = lexer->bufOff;
    unsigned int savedInUse = lexer->bufInUse;

    /* the token text is not needed, only its type */
    const unsigned char * ignoredBuf;
    unsigned int ignoredLen;

    yajl_tok next = yajl_lex_lex(lexer, jsonText, jsonTextLen, &offset,
                                 &ignoredBuf, &ignoredLen);

    lexer->bufOff = savedOff;
    lexer->bufInUse = savedInUse;
    yajl_buf_truncate(lexer->buf, savedLen);

    return next;
}
|