yaji 0.0.5 → 0.0.6

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,737 @@
1
+ /*
2
+ * Copyright 2010, Lloyd Hilaiel.
3
+ *
4
+ * Redistribution and use in source and binary forms, with or without
5
+ * modification, are permitted provided that the following conditions are
6
+ * met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright
9
+ * notice, this list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright
12
+ * notice, this list of conditions and the following disclaimer in
13
+ * the documentation and/or other materials provided with the
14
+ * distribution.
15
+ *
16
+ * 3. Neither the name of Lloyd Hilaiel nor the names of its
17
+ * contributors may be used to endorse or promote products derived
18
+ * from this software without specific prior written permission.
19
+ *
20
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
21
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
22
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
+ * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
24
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
25
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
28
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
29
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30
+ * POSSIBILITY OF SUCH DAMAGE.
31
+ */
32
+
33
+ #include "yajl_lex.h"
34
+ #include "yajl_buf.h"
35
+
36
+ #include <stdlib.h>
37
+ #include <stdio.h>
38
+ #include <assert.h>
39
+ #include <string.h>
40
+
41
+ #ifdef YAJL_LEXER_DEBUG
42
+ static const char *
43
+ tokToStr(yajl_tok tok)
44
+ {
45
+ switch (tok) {
46
+ case yajl_tok_bool: return "bool";
47
+ case yajl_tok_colon: return "colon";
48
+ case yajl_tok_comma: return "comma";
49
+ case yajl_tok_eof: return "eof";
50
+ case yajl_tok_error: return "error";
51
+ case yajl_tok_left_brace: return "brace";
52
+ case yajl_tok_left_bracket: return "bracket";
53
+ case yajl_tok_null: return "null";
54
+ case yajl_tok_integer: return "integer";
55
+ case yajl_tok_double: return "double";
56
+ case yajl_tok_right_brace: return "brace";
57
+ case yajl_tok_right_bracket: return "bracket";
58
+ case yajl_tok_string: return "string";
59
+ case yajl_tok_string_with_escapes: return "string_with_escapes";
60
+ }
61
+ return "unknown";
62
+ }
63
+ #endif
64
+
65
+ /* Impact of the stream parsing feature on the lexer:
66
+ *
67
+ * YAJL supports stream parsing. That is, the ability to parse the first
68
+ * bits of a chunk of JSON before the last bits are available (still on
69
+ * the network or disk). This makes the lexer more complex. The
70
+ * responsibility of the lexer is to handle transparently the case where
71
+ * a chunk boundary falls in the middle of a token. This is
72
+ * accomplished via a buffer and a character reading abstraction.
73
+ *
74
+ * Overview of implementation
75
+ *
76
+ * When we lex to end of input string before end of token is hit, we
77
+ * copy all of the input text composing the token into our lexBuf.
78
+ *
79
+ * Every time we read a character, we do so through the readChar function.
80
+ * readChar's responsibility is to handle pulling all chars from the buffer
81
+ * before pulling chars from input text
82
+ */
83
+
84
+ struct yajl_lexer_t {
85
+ /* the overal line and char offset into the data */
86
+ unsigned int lineOff;
87
+ unsigned int charOff;
88
+
89
+ /* error */
90
+ yajl_lex_error error;
91
+
92
+ /* a input buffer to handle the case where a token is spread over
93
+ * multiple chunks */
94
+ yajl_buf buf;
95
+
96
+ /* in the case where we have data in the lexBuf, bufOff holds
97
+ * the current offset into the lexBuf. */
98
+ unsigned int bufOff;
99
+
100
+ /* are we using the lex buf? */
101
+ unsigned int bufInUse;
102
+
103
+ /* shall we allow comments? */
104
+ unsigned int allowComments;
105
+
106
+ /* shall we validate utf8 inside strings? */
107
+ unsigned int validateUTF8;
108
+
109
+ yajl_alloc_funcs * alloc;
110
+ };
111
+
112
/* Fetch the next input byte and advance the matching read position.
 *
 * While the lexer's buffer is in use and bufOff is still inside it, the
 * byte is taken from the buffer and bufOff is incremented.  Otherwise the
 * byte is taken from txt at *off and *off is incremented.
 *
 * The macro performs no bounds check on txt; callers test *off against the
 * text length before invoking it.  Every argument is evaluated more than
 * once and is fully parenthesized, so any side-effect-free expression may
 * be passed. */
#define readChar(lxr, txt, off)                                           \
    (((lxr)->bufInUse && yajl_buf_len((lxr)->buf) &&                      \
      (lxr)->bufOff < yajl_buf_len((lxr)->buf)) ?                         \
     (*((const unsigned char *) yajl_buf_data((lxr)->buf) +               \
        ((lxr)->bufOff)++)) :                                             \
     ((txt)[(*(off))++]))

/* Step back over one byte: *off is decremented when it is greater than
 * zero, otherwise the lexer's bufOff is decremented. */
#define unreadChar(lxr, off) ((*(off) > 0) ? (*(off))-- : ((lxr)->bufOff--))
118
+
119
+ yajl_lexer
120
+ yajl_lex_alloc(yajl_alloc_funcs * alloc,
121
+ unsigned int allowComments, unsigned int validateUTF8)
122
+ {
123
+ yajl_lexer lxr = (yajl_lexer) YA_MALLOC(alloc, sizeof(struct yajl_lexer_t));
124
+ memset((void *) lxr, 0, sizeof(struct yajl_lexer_t));
125
+ lxr->buf = yajl_buf_alloc(alloc);
126
+ lxr->allowComments = allowComments;
127
+ lxr->validateUTF8 = validateUTF8;
128
+ lxr->alloc = alloc;
129
+ return lxr;
130
+ }
131
+
132
+ void
133
+ yajl_lex_free(yajl_lexer lxr)
134
+ {
135
+ yajl_buf_free(lxr->buf);
136
+ YA_FREE(lxr->alloc, lxr);
137
+ return;
138
+ }
139
+
140
/* Per-byte classification flags used while lexing strings:
 *   VEC - valid escaped control char (may follow a backslash)
 *   IJC - invalid json char (may not appear unescaped inside a string)
 *   VHC - valid hex char (may appear in the digits of a \u escape)
 * note: the solidus '/' may be escaped or not.
 */
#define VEC 1
#define IJC 2
#define VHC 4
static const char charLookupTable[256] =
{
/* 0x00 */ IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC,
           IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC,
/* 0x10 */ IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC,
           IJC, IJC, IJC, IJC, IJC, IJC, IJC, IJC,

/* 0x20: '"' at 0x22, '/' at 0x2f */
           0, 0, VEC|IJC, 0, 0, 0, 0, 0,
           0, 0, 0, 0, 0, 0, 0, VEC,
/* 0x30: '0'-'9' */
           VHC, VHC, VHC, VHC, VHC, VHC, VHC, VHC,
           VHC, VHC, 0, 0, 0, 0, 0, 0,

/* 0x40: 'A'-'F' */
           0, VHC, VHC, VHC, VHC, VHC, VHC, 0,
           0, 0, 0, 0, 0, 0, 0, 0,
/* 0x50: '\\' at 0x5c */
           0, 0, 0, 0, 0, 0, 0, 0,
           0, 0, 0, 0, VEC|IJC, 0, 0, 0,

/* 0x60: 'a'-'f' (with 'b' and 'f' also escapable), 'n' at 0x6e */
           0, VHC, VEC|VHC, VHC, VHC, VHC, VEC|VHC, 0,
           0, 0, 0, 0, 0, 0, VEC, 0,
/* 0x70: 'r' at 0x72, 't' at 0x74 */
           0, 0, VEC, 0, VEC, 0, 0, 0,
           0, 0, 0, 0, 0, 0, 0, 0

/* 0x80-0xff carry no flags.  They are left to implicit zero
 * initialization; the array still has 256 entries so that any byte value
 * can index it without a range check. */
};
193
+
194
+ /** process a variable length utf8 encoded codepoint.
195
+ *
196
+ * returns:
197
+ * yajl_tok_string - if valid utf8 char was parsed and offset was
198
+ * advanced
199
+ * yajl_tok_eof - if end of input was hit before validation could
200
+ * complete
201
+ * yajl_tok_error - if invalid utf8 was encountered
202
+ *
203
+ * NOTE: on error the offset will point to the first char of the
204
+ * invalid utf8 */
205
+ #define UTF8_CHECK_EOF if (*offset >= jsonTextLen) { return yajl_tok_eof; }
206
+
207
+ static yajl_tok
208
+ yajl_lex_utf8_char(yajl_lexer lexer, const unsigned char * jsonText,
209
+ unsigned int jsonTextLen, unsigned int * offset,
210
+ unsigned char curChar)
211
+ {
212
+ if (curChar <= 0x7f) {
213
+ /* single byte */
214
+ return yajl_tok_string;
215
+ } else if ((curChar >> 5) == 0x6) {
216
+ /* two byte */
217
+ UTF8_CHECK_EOF;
218
+ curChar = readChar(lexer, jsonText, offset);
219
+ if ((curChar >> 6) == 0x2) return yajl_tok_string;
220
+ } else if ((curChar >> 4) == 0x0e) {
221
+ /* three byte */
222
+ UTF8_CHECK_EOF;
223
+ curChar = readChar(lexer, jsonText, offset);
224
+ if ((curChar >> 6) == 0x2) {
225
+ UTF8_CHECK_EOF;
226
+ curChar = readChar(lexer, jsonText, offset);
227
+ if ((curChar >> 6) == 0x2) return yajl_tok_string;
228
+ }
229
+ } else if ((curChar >> 3) == 0x1e) {
230
+ /* four byte */
231
+ UTF8_CHECK_EOF;
232
+ curChar = readChar(lexer, jsonText, offset);
233
+ if ((curChar >> 6) == 0x2) {
234
+ UTF8_CHECK_EOF;
235
+ curChar = readChar(lexer, jsonText, offset);
236
+ if ((curChar >> 6) == 0x2) {
237
+ UTF8_CHECK_EOF;
238
+ curChar = readChar(lexer, jsonText, offset);
239
+ if ((curChar >> 6) == 0x2) return yajl_tok_string;
240
+ }
241
+ }
242
+ }
243
+
244
+ return yajl_tok_error;
245
+ }
246
+
247
+ /* lex a string. input is the lexer, pointer to beginning of
248
+ * json text, and start of string (offset).
249
+ * a token is returned which has the following meanings:
250
+ * yajl_tok_string: lex of string was successful. offset points to
251
+ * terminating '"'.
252
+ * yajl_tok_eof: end of text was encountered before we could complete
253
+ * the lex.
254
+ * yajl_tok_error: embedded in the string were unallowable chars. offset
255
+ * points to the offending char
256
+ */
257
+ #define STR_CHECK_EOF \
258
+ if (*offset >= jsonTextLen) { \
259
+ tok = yajl_tok_eof; \
260
+ goto finish_string_lex; \
261
+ }
262
+
263
+ static yajl_tok
264
+ yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText,
265
+ unsigned int jsonTextLen, unsigned int * offset)
266
+ {
267
+ yajl_tok tok = yajl_tok_error;
268
+ int hasEscapes = 0;
269
+
270
+ for (;;) {
271
+ unsigned char curChar;
272
+
273
+ STR_CHECK_EOF;
274
+
275
+ curChar = readChar(lexer, jsonText, offset);
276
+
277
+ /* quote terminates */
278
+ if (curChar == '"') {
279
+ tok = yajl_tok_string;
280
+ break;
281
+ }
282
+ /* backslash escapes a set of control chars, */
283
+ else if (curChar == '\\') {
284
+ hasEscapes = 1;
285
+ STR_CHECK_EOF;
286
+
287
+ /* special case \u */
288
+ curChar = readChar(lexer, jsonText, offset);
289
+ if (curChar == 'u') {
290
+ unsigned int i = 0;
291
+
292
+ for (i=0;i<4;i++) {
293
+ STR_CHECK_EOF;
294
+ curChar = readChar(lexer, jsonText, offset);
295
+ if (!(charLookupTable[curChar] & VHC)) {
296
+ /* back up to offending char */
297
+ unreadChar(lexer, offset);
298
+ lexer->error = yajl_lex_string_invalid_hex_char;
299
+ goto finish_string_lex;
300
+ }
301
+ }
302
+ } else if (!(charLookupTable[curChar] & VEC)) {
303
+ /* back up to offending char */
304
+ unreadChar(lexer, offset);
305
+ lexer->error = yajl_lex_string_invalid_escaped_char;
306
+ goto finish_string_lex;
307
+ }
308
+ }
309
+ /* when not validating UTF8 it's a simple table lookup to determine
310
+ * if the present character is invalid */
311
+ else if(charLookupTable[curChar] & IJC) {
312
+ /* back up to offending char */
313
+ unreadChar(lexer, offset);
314
+ lexer->error = yajl_lex_string_invalid_json_char;
315
+ goto finish_string_lex;
316
+ }
317
+ /* when in validate UTF8 mode we need to do some extra work */
318
+ else if (lexer->validateUTF8) {
319
+ yajl_tok t = yajl_lex_utf8_char(lexer, jsonText, jsonTextLen,
320
+ offset, curChar);
321
+
322
+ if (t == yajl_tok_eof) {
323
+ tok = yajl_tok_eof;
324
+ goto finish_string_lex;
325
+ } else if (t == yajl_tok_error) {
326
+ lexer->error = yajl_lex_string_invalid_utf8;
327
+ goto finish_string_lex;
328
+ }
329
+ }
330
+ /* accept it, and move on */
331
+ }
332
+ finish_string_lex:
333
+ /* tell our buddy, the parser, wether he needs to process this string
334
+ * again */
335
+ if (hasEscapes && tok == yajl_tok_string) {
336
+ tok = yajl_tok_string_with_escapes;
337
+ }
338
+
339
+ return tok;
340
+ }
341
+
342
+ #define RETURN_IF_EOF if (*offset >= jsonTextLen) return yajl_tok_eof;
343
+
344
+ static yajl_tok
345
+ yajl_lex_number(yajl_lexer lexer, const unsigned char * jsonText,
346
+ unsigned int jsonTextLen, unsigned int * offset)
347
+ {
348
+ /** XXX: numbers are the only entities in json that we must lex
349
+ * _beyond_ in order to know that they are complete. There
350
+ * is an ambiguous case for integers at EOF. */
351
+
352
+ unsigned char c;
353
+
354
+ yajl_tok tok = yajl_tok_integer;
355
+
356
+ RETURN_IF_EOF;
357
+ c = readChar(lexer, jsonText, offset);
358
+
359
+ /* optional leading minus */
360
+ if (c == '-') {
361
+ RETURN_IF_EOF;
362
+ c = readChar(lexer, jsonText, offset);
363
+ }
364
+
365
+ /* a single zero, or a series of integers */
366
+ if (c == '0') {
367
+ RETURN_IF_EOF;
368
+ c = readChar(lexer, jsonText, offset);
369
+ } else if (c >= '1' && c <= '9') {
370
+ do {
371
+ RETURN_IF_EOF;
372
+ c = readChar(lexer, jsonText, offset);
373
+ } while (c >= '0' && c <= '9');
374
+ } else {
375
+ unreadChar(lexer, offset);
376
+ lexer->error = yajl_lex_missing_integer_after_minus;
377
+ return yajl_tok_error;
378
+ }
379
+
380
+ /* optional fraction (indicates this is floating point) */
381
+ if (c == '.') {
382
+ int numRd = 0;
383
+
384
+ RETURN_IF_EOF;
385
+ c = readChar(lexer, jsonText, offset);
386
+
387
+ while (c >= '0' && c <= '9') {
388
+ numRd++;
389
+ RETURN_IF_EOF;
390
+ c = readChar(lexer, jsonText, offset);
391
+ }
392
+
393
+ if (!numRd) {
394
+ unreadChar(lexer, offset);
395
+ lexer->error = yajl_lex_missing_integer_after_decimal;
396
+ return yajl_tok_error;
397
+ }
398
+ tok = yajl_tok_double;
399
+ }
400
+
401
+ /* optional exponent (indicates this is floating point) */
402
+ if (c == 'e' || c == 'E') {
403
+ RETURN_IF_EOF;
404
+ c = readChar(lexer, jsonText, offset);
405
+
406
+ /* optional sign */
407
+ if (c == '+' || c == '-') {
408
+ RETURN_IF_EOF;
409
+ c = readChar(lexer, jsonText, offset);
410
+ }
411
+
412
+ if (c >= '0' && c <= '9') {
413
+ do {
414
+ RETURN_IF_EOF;
415
+ c = readChar(lexer, jsonText, offset);
416
+ } while (c >= '0' && c <= '9');
417
+ } else {
418
+ unreadChar(lexer, offset);
419
+ lexer->error = yajl_lex_missing_integer_after_exponent;
420
+ return yajl_tok_error;
421
+ }
422
+ tok = yajl_tok_double;
423
+ }
424
+
425
+ /* we always go "one too far" */
426
+ unreadChar(lexer, offset);
427
+
428
+ return tok;
429
+ }
430
+
431
+ static yajl_tok
432
+ yajl_lex_comment(yajl_lexer lexer, const unsigned char * jsonText,
433
+ unsigned int jsonTextLen, unsigned int * offset)
434
+ {
435
+ unsigned char c;
436
+
437
+ yajl_tok tok = yajl_tok_comment;
438
+
439
+ RETURN_IF_EOF;
440
+ c = readChar(lexer, jsonText, offset);
441
+
442
+ /* either slash or star expected */
443
+ if (c == '/') {
444
+ /* now we throw away until end of line */
445
+ do {
446
+ RETURN_IF_EOF;
447
+ c = readChar(lexer, jsonText, offset);
448
+ } while (c != '\n');
449
+ } else if (c == '*') {
450
+ /* now we throw away until end of comment */
451
+ for (;;) {
452
+ RETURN_IF_EOF;
453
+ c = readChar(lexer, jsonText, offset);
454
+ if (c == '*') {
455
+ RETURN_IF_EOF;
456
+ c = readChar(lexer, jsonText, offset);
457
+ if (c == '/') {
458
+ break;
459
+ } else {
460
+ unreadChar(lexer, offset);
461
+ }
462
+ }
463
+ }
464
+ } else {
465
+ lexer->error = yajl_lex_invalid_char;
466
+ tok = yajl_tok_error;
467
+ }
468
+
469
+ return tok;
470
+ }
471
+
472
+ yajl_tok
473
+ yajl_lex_lex(yajl_lexer lexer, const unsigned char * jsonText,
474
+ unsigned int jsonTextLen, unsigned int * offset,
475
+ const unsigned char ** outBuf, unsigned int * outLen)
476
+ {
477
+ yajl_tok tok = yajl_tok_error;
478
+ unsigned char c;
479
+ unsigned int startOffset = *offset;
480
+
481
+ *outBuf = NULL;
482
+ *outLen = 0;
483
+
484
+ for (;;) {
485
+ assert(*offset <= jsonTextLen);
486
+
487
+ if (*offset >= jsonTextLen) {
488
+ tok = yajl_tok_eof;
489
+ goto lexed;
490
+ }
491
+
492
+ c = readChar(lexer, jsonText, offset);
493
+
494
+ switch (c) {
495
+ case '{':
496
+ tok = yajl_tok_left_bracket;
497
+ goto lexed;
498
+ case '}':
499
+ tok = yajl_tok_right_bracket;
500
+ goto lexed;
501
+ case '[':
502
+ tok = yajl_tok_left_brace;
503
+ goto lexed;
504
+ case ']':
505
+ tok = yajl_tok_right_brace;
506
+ goto lexed;
507
+ case ',':
508
+ tok = yajl_tok_comma;
509
+ goto lexed;
510
+ case ':':
511
+ tok = yajl_tok_colon;
512
+ goto lexed;
513
+ case '\t': case '\n': case '\v': case '\f': case '\r': case ' ':
514
+ startOffset++;
515
+ break;
516
+ case 't': {
517
+ const char * want = "rue";
518
+ do {
519
+ if (*offset >= jsonTextLen) {
520
+ tok = yajl_tok_eof;
521
+ goto lexed;
522
+ }
523
+ c = readChar(lexer, jsonText, offset);
524
+ if (c != *want) {
525
+ unreadChar(lexer, offset);
526
+ lexer->error = yajl_lex_invalid_string;
527
+ tok = yajl_tok_error;
528
+ goto lexed;
529
+ }
530
+ } while (*(++want));
531
+ tok = yajl_tok_bool;
532
+ goto lexed;
533
+ }
534
+ case 'f': {
535
+ const char * want = "alse";
536
+ do {
537
+ if (*offset >= jsonTextLen) {
538
+ tok = yajl_tok_eof;
539
+ goto lexed;
540
+ }
541
+ c = readChar(lexer, jsonText, offset);
542
+ if (c != *want) {
543
+ unreadChar(lexer, offset);
544
+ lexer->error = yajl_lex_invalid_string;
545
+ tok = yajl_tok_error;
546
+ goto lexed;
547
+ }
548
+ } while (*(++want));
549
+ tok = yajl_tok_bool;
550
+ goto lexed;
551
+ }
552
+ case 'n': {
553
+ const char * want = "ull";
554
+ do {
555
+ if (*offset >= jsonTextLen) {
556
+ tok = yajl_tok_eof;
557
+ goto lexed;
558
+ }
559
+ c = readChar(lexer, jsonText, offset);
560
+ if (c != *want) {
561
+ unreadChar(lexer, offset);
562
+ lexer->error = yajl_lex_invalid_string;
563
+ tok = yajl_tok_error;
564
+ goto lexed;
565
+ }
566
+ } while (*(++want));
567
+ tok = yajl_tok_null;
568
+ goto lexed;
569
+ }
570
+ case '"': {
571
+ tok = yajl_lex_string(lexer, (const unsigned char *) jsonText,
572
+ jsonTextLen, offset);
573
+ goto lexed;
574
+ }
575
+ case '-':
576
+ case '0': case '1': case '2': case '3': case '4':
577
+ case '5': case '6': case '7': case '8': case '9': {
578
+ /* integer parsing wants to start from the beginning */
579
+ unreadChar(lexer, offset);
580
+ tok = yajl_lex_number(lexer, (const unsigned char *) jsonText,
581
+ jsonTextLen, offset);
582
+ goto lexed;
583
+ }
584
+ case '/':
585
+ /* hey, look, a probable comment! If comments are disabled
586
+ * it's an error. */
587
+ if (!lexer->allowComments) {
588
+ unreadChar(lexer, offset);
589
+ lexer->error = yajl_lex_unallowed_comment;
590
+ tok = yajl_tok_error;
591
+ goto lexed;
592
+ }
593
+ /* if comments are enabled, then we should try to lex
594
+ * the thing. possible outcomes are
595
+ * - successful lex (tok_comment, which means continue),
596
+ * - malformed comment opening (slash not followed by
597
+ * '*' or '/') (tok_error)
598
+ * - eof hit. (tok_eof) */
599
+ tok = yajl_lex_comment(lexer, (const unsigned char *) jsonText,
600
+ jsonTextLen, offset);
601
+ if (tok == yajl_tok_comment) {
602
+ /* "error" is silly, but that's the initial
603
+ * state of tok. guilty until proven innocent. */
604
+ tok = yajl_tok_error;
605
+ yajl_buf_clear(lexer->buf);
606
+ lexer->bufInUse = 0;
607
+ startOffset = *offset;
608
+ break;
609
+ }
610
+ /* hit error or eof, bail */
611
+ goto lexed;
612
+ default:
613
+ lexer->error = yajl_lex_invalid_char;
614
+ tok = yajl_tok_error;
615
+ goto lexed;
616
+ }
617
+ }
618
+
619
+
620
+ lexed:
621
+ /* need to append to buffer if the buffer is in use or
622
+ * if it's an EOF token */
623
+ if (tok == yajl_tok_eof || lexer->bufInUse) {
624
+ if (!lexer->bufInUse) yajl_buf_clear(lexer->buf);
625
+ lexer->bufInUse = 1;
626
+ yajl_buf_append(lexer->buf, jsonText + startOffset, *offset - startOffset);
627
+ lexer->bufOff = 0;
628
+
629
+ if (tok != yajl_tok_eof) {
630
+ *outBuf = yajl_buf_data(lexer->buf);
631
+ *outLen = yajl_buf_len(lexer->buf);
632
+ lexer->bufInUse = 0;
633
+ }
634
+ } else if (tok != yajl_tok_error) {
635
+ *outBuf = jsonText + startOffset;
636
+ *outLen = *offset - startOffset;
637
+ }
638
+
639
+ /* special case for strings. skip the quotes. */
640
+ if (tok == yajl_tok_string || tok == yajl_tok_string_with_escapes)
641
+ {
642
+ assert(*outLen >= 2);
643
+ (*outBuf)++;
644
+ *outLen -= 2;
645
+ }
646
+
647
+
648
+ #ifdef YAJL_LEXER_DEBUG
649
+ if (tok == yajl_tok_error) {
650
+ printf("lexical error: %s\n",
651
+ yajl_lex_error_to_string(yajl_lex_get_error(lexer)));
652
+ } else if (tok == yajl_tok_eof) {
653
+ printf("EOF hit\n");
654
+ } else {
655
+ printf("lexed %s: '", tokToStr(tok));
656
+ fwrite(*outBuf, 1, *outLen, stdout);
657
+ printf("'\n");
658
+ }
659
+ #endif
660
+
661
+ return tok;
662
+ }
663
+
664
+ const char *
665
+ yajl_lex_error_to_string(yajl_lex_error error)
666
+ {
667
+ switch (error) {
668
+ case yajl_lex_e_ok:
669
+ return "ok, no error";
670
+ case yajl_lex_string_invalid_utf8:
671
+ return "invalid bytes in UTF8 string.";
672
+ case yajl_lex_string_invalid_escaped_char:
673
+ return "inside a string, '\\' occurs before a character "
674
+ "which it may not.";
675
+ case yajl_lex_string_invalid_json_char:
676
+ return "invalid character inside string.";
677
+ case yajl_lex_string_invalid_hex_char:
678
+ return "invalid (non-hex) character occurs after '\\u' inside "
679
+ "string.";
680
+ case yajl_lex_invalid_char:
681
+ return "invalid char in json text.";
682
+ case yajl_lex_invalid_string:
683
+ return "invalid string in json text.";
684
+ case yajl_lex_missing_integer_after_exponent:
685
+ return "malformed number, a digit is required after the exponent.";
686
+ case yajl_lex_missing_integer_after_decimal:
687
+ return "malformed number, a digit is required after the "
688
+ "decimal point.";
689
+ case yajl_lex_missing_integer_after_minus:
690
+ return "malformed number, a digit is required after the "
691
+ "minus sign.";
692
+ case yajl_lex_unallowed_comment:
693
+ return "probable comment found in input text, comments are "
694
+ "not enabled.";
695
+ }
696
+ return "unknown error code";
697
+ }
698
+
699
+
700
+ /** allows access to more specific information about the lexical
701
+ * error when yajl_lex_lex returns yajl_tok_error. */
702
+ yajl_lex_error
703
+ yajl_lex_get_error(yajl_lexer lexer)
704
+ {
705
+ if (lexer == NULL) return (yajl_lex_error) -1;
706
+ return lexer->error;
707
+ }
708
+
709
+ unsigned int yajl_lex_current_line(yajl_lexer lexer)
710
+ {
711
+ return lexer->lineOff;
712
+ }
713
+
714
+ unsigned int yajl_lex_current_char(yajl_lexer lexer)
715
+ {
716
+ return lexer->charOff;
717
+ }
718
+
719
+ yajl_tok yajl_lex_peek(yajl_lexer lexer, const unsigned char * jsonText,
720
+ unsigned int jsonTextLen, unsigned int offset)
721
+ {
722
+ const unsigned char * outBuf;
723
+ unsigned int outLen;
724
+ unsigned int bufLen = yajl_buf_len(lexer->buf);
725
+ unsigned int bufOff = lexer->bufOff;
726
+ unsigned int bufInUse = lexer->bufInUse;
727
+ yajl_tok tok;
728
+
729
+ tok = yajl_lex_lex(lexer, jsonText, jsonTextLen, &offset,
730
+ &outBuf, &outLen);
731
+
732
+ lexer->bufOff = bufOff;
733
+ lexer->bufInUse = bufInUse;
734
+ yajl_buf_truncate(lexer->buf, bufLen);
735
+
736
+ return tok;
737
+ }