smtlaissezfaire-gazelle 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,813 @@
1
+ /*********************************************************************
2
+
3
+ Gazelle: a system for building fast, reusable parsers
4
+
5
+ interpreter.c
6
+
7
+ Once a compiled grammar has been loaded into memory, the routines
8
+ in this file are what actually do the parsing. This file is an
9
+ "interpreter" in the sense that it parses the input by using the
10
+ grammar as a data structure -- no grammar-specific code is ever
11
+ generated or executed. Despite this, it is still quite fast, and
12
+ has a very low memory footprint.
13
+
14
+ The interpreter primarily consists of maintaining the parse stack
15
+ properly and transitioning the frames in response to the input.
16
+
17
+ Copyright (c) 2007-2009 Joshua Haberman. See LICENSE for details.
18
+
19
+ *********************************************************************/
20
+
21
+ #include <stdio.h>
22
+ #include <stdlib.h>
23
+ #include <assert.h>
24
+ #include <string.h>
25
+
26
+ #include "gazelle/parse.h"
27
+
28
+ /*
29
+ * The following are stack-manipulation functions. Gazelle maintains a runtime
30
+ * stack (which is completely separate from the C stack), and these functions
31
+ * provide pushing and popping of different kinds of stack frames.
32
+ */
33
+
34
+ static
35
+ struct gzl_parse_stack_frame *push_empty_frame(struct gzl_parse_state *s,
36
+ enum gzl_frame_type frame_type,
37
+ struct gzl_offset *start_offset)
38
+ {
39
+ RESIZE_DYNARRAY(s->parse_stack, s->parse_stack_len+1);
40
+ struct gzl_parse_stack_frame *frame = DYNARRAY_GET_TOP(s->parse_stack);
41
+ frame->frame_type = frame_type;
42
+ frame->start_offset = *start_offset;
43
+ return frame;
44
+ }
45
+
46
+ static
47
+ struct gzl_intfa_frame *push_intfa_frame(struct gzl_parse_state *s,
48
+ struct gzl_intfa *intfa,
49
+ struct gzl_offset *start_offset)
50
+ {
51
+ struct gzl_parse_stack_frame *frame =
52
+ push_empty_frame(s, GZL_FRAME_TYPE_INTFA, start_offset);
53
+ struct gzl_intfa_frame *intfa_frame = &frame->f.intfa_frame;
54
+ intfa_frame->intfa = intfa;
55
+ intfa_frame->intfa_state = &intfa->states[0];
56
+ return intfa_frame;
57
+ }
58
+
59
+ static
60
+ struct gzl_parse_stack_frame *push_gla_frame(struct gzl_parse_state *s,
61
+ struct gzl_gla *gla,
62
+ struct gzl_offset *start_offset)
63
+ {
64
+ struct gzl_parse_stack_frame *frame =
65
+ push_empty_frame(s, GZL_FRAME_TYPE_GLA, start_offset);
66
+ struct gzl_gla_frame *gla_frame = &frame->f.gla_frame;
67
+ gla_frame->gla = gla;
68
+ gla_frame->gla_state = &gla->states[0];
69
+ return frame;
70
+ }
71
+
72
+ static
73
+ enum gzl_status push_rtn_frame(struct gzl_parse_state *s,
74
+ struct gzl_rtn *rtn,
75
+ struct gzl_offset *start_offset)
76
+ {
77
+ struct gzl_parse_stack_frame *new_frame =
78
+ push_empty_frame(s, GZL_FRAME_TYPE_RTN, start_offset);
79
+ struct gzl_rtn_frame *new_rtn_frame = &new_frame->f.rtn_frame;
80
+ new_rtn_frame->rtn = rtn;
81
+ new_rtn_frame->rtn_transition = NULL;
82
+ new_rtn_frame->rtn_state = &new_rtn_frame->rtn->states[0];
83
+ if(s->bound_grammar->start_rule_cb) s->bound_grammar->start_rule_cb(s);
84
+ return GZL_STATUS_OK;
85
+ }
86
+
87
+ static
88
+ enum gzl_status push_rtn_frame_for_transition(struct gzl_parse_state *s,
89
+ struct gzl_rtn_transition *t,
90
+ struct gzl_offset *start_offset)
91
+ {
92
+ struct gzl_rtn_frame *old_rtn_frame =
93
+ &DYNARRAY_GET_TOP(s->parse_stack)->f.rtn_frame;
94
+ old_rtn_frame->rtn_transition = t;
95
+ return push_rtn_frame(s, t->edge.nonterminal, start_offset);
96
+ }
97
+
98
+ static
99
+ struct gzl_parse_stack_frame *pop_frame(struct gzl_parse_state *s)
100
+ {
101
+ assert(s->parse_stack_len > 0);
102
+ RESIZE_DYNARRAY(s->parse_stack, s->parse_stack_len-1);
103
+ return s->parse_stack_len > 0 ? DYNARRAY_GET_TOP(s->parse_stack) : NULL;
104
+ }
105
+
106
+ static
107
+ enum gzl_status pop_rtn_frame(struct gzl_parse_state *s)
108
+ {
109
+ assert(DYNARRAY_GET_TOP(s->parse_stack)->frame_type == GZL_FRAME_TYPE_RTN);
110
+ if(s->bound_grammar->end_rule_cb) s->bound_grammar->end_rule_cb(s);
111
+
112
+ struct gzl_parse_stack_frame *frame = pop_frame(s);
113
+ if(frame) {
114
+ assert(frame->frame_type == GZL_FRAME_TYPE_RTN);
115
+ struct gzl_rtn_frame *rtn_frame = &frame->f.rtn_frame;
116
+ if(rtn_frame->rtn_transition)
117
+ rtn_frame->rtn_state = rtn_frame->rtn_transition->dest_state;
118
+ else {
119
+ /* Should only happen at the top level. */
120
+ assert(s->parse_stack_len == 1);
121
+ }
122
+ return GZL_STATUS_OK;
123
+ } else
124
+ return GZL_STATUS_HARD_EOF;
125
+ }
126
+
127
+ static
128
+ struct gzl_parse_stack_frame *pop_gla_frame(struct gzl_parse_state *s)
129
+ {
130
+ assert(DYNARRAY_GET_TOP(s->parse_stack)->frame_type == GZL_FRAME_TYPE_GLA);
131
+ return pop_frame(s);
132
+ }
133
+
134
+ static
135
+ struct gzl_parse_stack_frame *pop_intfa_frame(struct gzl_parse_state *s)
136
+ {
137
+ assert(DYNARRAY_GET_TOP(s->parse_stack)->frame_type ==
138
+ GZL_FRAME_TYPE_INTFA);
139
+ return pop_frame(s);
140
+ }
141
+
142
+ /*
143
+ * descend_to_gla(): given the current parse stack, pushes any RTN or GLA
144
+ * stack frames representing transitions that can be taken without consuming
145
+ * any terminals.
146
+ *
147
+ * Preconditions:
148
+ * - the current frame is either an RTN frame or a GLA frame
149
+ *
150
+ * Postconditions:
151
+ * - the current frame is an RTN frame or a GLA frame. If a new GLA frame was
152
+ * entered, entered_gla is set to true.
153
+ */
154
+ static
155
+ enum gzl_status descend_to_gla(struct gzl_parse_state *s, bool *entered_gla,
156
+ struct gzl_offset *start_offset)
157
+ {
158
+ *entered_gla = false;
159
+ while(true) {
160
+ struct gzl_parse_stack_frame *frame = DYNARRAY_GET_TOP(s->parse_stack);
161
+ if(frame->frame_type != GZL_FRAME_TYPE_RTN) return GZL_STATUS_OK;
162
+
163
+ /* Subtract 1 because there can be one IntFA frame beyond the RTN and
164
+ * GLA frames this function pushes. */
165
+ if(s->parse_stack_len >= s->max_stack_depth-1)
166
+ return GZL_STATUS_RESOURCE_LIMIT_EXCEEDED;
167
+
168
+ struct gzl_rtn_frame *rtn_frame = &frame->f.rtn_frame;
169
+ switch(rtn_frame->rtn_state->lookahead_type) {
170
+ case GZL_STATE_HAS_INTFA:
171
+ return GZL_STATUS_OK;
172
+
173
+ case GZL_STATE_HAS_GLA:
174
+ *entered_gla = true;
175
+ push_gla_frame(s, rtn_frame->rtn_state->d.state_gla, start_offset);
176
+ return GZL_STATUS_OK;
177
+
178
+ case GZL_STATE_HAS_NEITHER:
179
+ /* An RTN state has neither an IntFA or a GLA in only two cases:
180
+ * - it is a final state with no outgoing transitions
181
+ * - it is a nonfinal state with only one transition (a nonterminal)
182
+ */
183
+ assert(rtn_frame->rtn_state->num_transitions < 2);
184
+ enum gzl_status status = GZL_STATUS_OK;
185
+ if(rtn_frame->rtn_state->num_transitions == 0)
186
+ status = pop_rtn_frame(s); /* Final state */
187
+ else if(rtn_frame->rtn_state->num_transitions == 1) {
188
+ assert(rtn_frame->rtn_state->transitions[0].transition_type ==
189
+ GZL_NONTERM_TRANSITION);
190
+ status = push_rtn_frame_for_transition(
191
+ s, &rtn_frame->rtn_state->transitions[0], start_offset);
192
+ }
193
+ if(status != GZL_STATUS_OK) return status;
194
+ break;
195
+ }
196
+ }
197
+ }
198
+
199
+ static
200
+ struct gzl_intfa_frame *push_intfa_frame_for_gla_or_rtn(
201
+ struct gzl_parse_state *s)
202
+ {
203
+ struct gzl_parse_stack_frame *frame = DYNARRAY_GET_TOP(s->parse_stack);
204
+ if(frame->frame_type == GZL_FRAME_TYPE_GLA) {
205
+ struct gzl_gla_state *gla_state = frame->f.gla_frame.gla_state;
206
+ assert(gla_state->is_final == false);
207
+ return push_intfa_frame(s, gla_state->d.nonfinal.intfa, &s->offset);
208
+ } else if(frame->frame_type == GZL_FRAME_TYPE_RTN) {
209
+ struct gzl_rtn_state *rtn_state = frame->f.rtn_frame.rtn_state;
210
+ assert(rtn_state->lookahead_type == GZL_STATE_HAS_INTFA);
211
+ return push_intfa_frame(s, rtn_state->d.state_intfa, &s->offset);
212
+ }
213
+ assert(false); /* should never reach here. */
214
+ return NULL;
215
+ }
216
+
217
+ static
218
+ enum gzl_status do_rtn_terminal_transition(struct gzl_parse_state *s,
219
+ struct gzl_rtn_transition *t,
220
+ struct gzl_terminal *terminal)
221
+ {
222
+ struct gzl_parse_stack_frame *frame = DYNARRAY_GET_TOP(s->parse_stack);
223
+ assert(frame->frame_type == GZL_FRAME_TYPE_RTN);
224
+ struct gzl_rtn_frame *rtn_frame = &frame->f.rtn_frame;
225
+ rtn_frame->rtn_transition = t;
226
+ if(s->bound_grammar->terminal_cb)
227
+ s->bound_grammar->terminal_cb(s, terminal);
228
+ assert(t->transition_type == GZL_TERMINAL_TRANSITION);
229
+ rtn_frame->rtn_state = t->dest_state;
230
+ return GZL_STATUS_OK;
231
+ }
232
+
233
+ static
234
+ struct gzl_rtn_transition *find_rtn_terminal_transition(
235
+ struct gzl_rtn_state *rtn_state, struct gzl_terminal *terminal)
236
+ {
237
+ int i;
238
+ for(i = 0; i < rtn_state->num_transitions; i++) {
239
+ struct gzl_rtn_transition *t = &rtn_state->transitions[i];
240
+ if(t->transition_type == GZL_TERMINAL_TRANSITION &&
241
+ t->edge.terminal_name == terminal->name)
242
+ return t;
243
+ }
244
+ return NULL;
245
+ }
246
+
247
+ static
248
+ struct gzl_gla_transition *find_gla_transition(struct gzl_gla_state *gla_state,
249
+ char *term_name)
250
+ {
251
+ int i;
252
+ for(i = 0; i < gla_state->d.nonfinal.num_transitions; i++) {
253
+ struct gzl_gla_transition *t = &gla_state->d.nonfinal.transitions[i];
254
+ if(t->term == term_name)
255
+ return t;
256
+ }
257
+ return NULL;
258
+ }
259
+
260
+ static
261
+ struct gzl_intfa_transition *find_intfa_transition(
262
+ struct gzl_intfa_state *intfa_state, char ch)
263
+ {
264
+ int i;
265
+ for(i = 0; i < intfa_state->num_transitions; i++) {
266
+ struct gzl_intfa_transition *t = &intfa_state->transitions[i];
267
+ if(ch >= t->ch_low && ch <= t->ch_high)
268
+ return t;
269
+ }
270
+ return NULL;
271
+ }
272
+
273
+ /*
274
+ * do_gla_transition(): transitions a GLA frame, performing the appropriate
275
+ * RTN transitions if this puts the GLA in a final state.
276
+ *
277
+ * Preconditions:
278
+ * - the current stack frame is a GLA frame
279
+ * - term is a terminal that came from this GLA state's intfa
280
+ *
281
+ * Postconditions:
282
+ * - the current stack frame is a GLA frame (this would indicate that
283
+ * the GLA hasn't hit a final state yet) or the current stack frame is
284
+ * an RTN frame (indicating we *have* hit a final state in the GLA)
285
+ */
286
+ static
287
+ enum gzl_status do_gla_transition(struct gzl_parse_state *s,
288
+ struct gzl_terminal *term,
289
+ size_t *rtn_term_offset)
290
+ {
291
+ struct gzl_parse_stack_frame *frame = DYNARRAY_GET_TOP(s->parse_stack);
292
+ assert(frame->frame_type == GZL_FRAME_TYPE_GLA);
293
+ assert(frame->f.gla_frame.gla_state->is_final == false);
294
+ struct gzl_gla_state *gla_state = frame->f.gla_frame.gla_state;
295
+ struct gzl_gla_state *dest_gla_state = NULL;
296
+
297
+ /* Find the transition. */
298
+ struct gzl_gla_transition *t = find_gla_transition(gla_state, term->name);
299
+ if(!t) {
300
+ /* Parse error: terminal for which we had no GLA transition. */
301
+ if(s->bound_grammar->error_terminal_cb)
302
+ s->bound_grammar->error_terminal_cb(s, term);
303
+ return GZL_STATUS_ERROR;
304
+ }
305
+ /* Perform the transition. */
306
+ assert(t->dest_state);
307
+ frame->f.gla_frame.gla_state = t->dest_state;
308
+ dest_gla_state = t->dest_state;
309
+
310
+ /* Perform appropriate actions if we're in a final state. */
311
+ enum gzl_status status = GZL_STATUS_OK;
312
+ if(dest_gla_state->is_final) {
313
+ /* Pop the GLA frame (since now we know what RTN transition to take)
314
+ * and use its information to make an RTN transition. */
315
+ int offset = dest_gla_state->d.final.transition_offset;
316
+ frame = pop_gla_frame(s);
317
+ if(offset == 0)
318
+ status = pop_rtn_frame(s);
319
+ else {
320
+ struct gzl_rtn_state *rtn_state = frame->f.rtn_frame.rtn_state;
321
+ struct gzl_rtn_transition *t = &rtn_state->transitions[offset-1];
322
+ struct gzl_terminal *next_term = &s->token_buffer[*rtn_term_offset];
323
+ if(t->transition_type == GZL_TERMINAL_TRANSITION) {
324
+ /* The transition must match what we have in the token buffer */
325
+ assert(next_term->name == t->edge.terminal_name);
326
+ (*rtn_term_offset)++;
327
+ status = do_rtn_terminal_transition(s, t, next_term);
328
+ } else
329
+ status = push_rtn_frame_for_transition(
330
+ s, t, &next_term->offset);
331
+ }
332
+ }
333
+ return status;
334
+ }
335
+
336
+ /*
337
+ * process_terminal(): processes a terminal that was just lexed, possibly
338
+ * triggering a series of RTN and/or GLA transitions.
339
+ *
340
+ * Preconditions:
341
+ * - the current stack frame is an intfa frame representing the intfa that
342
+ * just produced this terminal
343
+ * - the given terminal can be recognized by the current GLA or RTN state
344
+ *
345
+ * Postconditions:
346
+ * - the current stack frame is an GLA or RTN frame representing the state after
347
+ * all available GLA and RTN transitions have been taken.
348
+ */
349
+
350
+ static
351
+ enum gzl_status process_terminal(struct gzl_parse_state *s, char *term_name,
352
+ struct gzl_offset *start_offset, int len)
353
+ {
354
+ pop_intfa_frame(s);
355
+ struct gzl_parse_stack_frame *frame = DYNARRAY_GET_TOP(s->parse_stack);
356
+ size_t rtn_term_offset = 0;
357
+ size_t gla_term_offset = s->token_buffer_len;
358
+
359
+ RESIZE_DYNARRAY(s->token_buffer, s->token_buffer_len+1);
360
+ if(s->token_buffer_len >= s->max_lookahead)
361
+ return GZL_STATUS_RESOURCE_LIMIT_EXCEEDED;
362
+
363
+ struct gzl_terminal *term = DYNARRAY_GET_TOP(s->token_buffer);
364
+ term->name = term_name;
365
+ term->offset = *start_offset;
366
+ term->len = len;
367
+
368
+ /* Feed tokens to RTNs and GLAs until we have processed all the tokens we
369
+ * have. */
370
+ enum gzl_status status = GZL_STATUS_OK;
371
+ enum gzl_frame_type frame_type = frame->frame_type;
372
+ do {
373
+ /* Take one terminal transition, for either an RTN or a GLA. */
374
+ if(frame_type == GZL_FRAME_TYPE_RTN) {
375
+ struct gzl_terminal *rtn_term = &s->token_buffer[rtn_term_offset];
376
+ struct gzl_rtn_transition *t;
377
+ rtn_term_offset++;
378
+
379
+ if(rtn_term->name == NULL)
380
+ /* Skip: RTNs don't process EOF as a terminal, only GLAs do. */
381
+ continue;
382
+ t = find_rtn_terminal_transition(frame->f.rtn_frame.rtn_state,
383
+ rtn_term);
384
+ if(!t) {
385
+ /* Parse error: terminal for which we had no RTN transition. */
386
+ if(s->bound_grammar->error_terminal_cb)
387
+ s->bound_grammar->error_terminal_cb(s, term);
388
+ return GZL_STATUS_ERROR;
389
+ }
390
+ status = do_rtn_terminal_transition(s, t, rtn_term);
391
+ } else {
392
+ struct gzl_terminal *gla_term = &s->token_buffer[gla_term_offset++];
393
+ status = do_gla_transition(s, gla_term, &rtn_term_offset);
394
+ }
395
+
396
+ /* Having taken a transition, push any new frames onto the stack. */
397
+ if(status == GZL_STATUS_OK) {
398
+ bool entered_gla;
399
+ if(rtn_term_offset < s->token_buffer_len)
400
+ status = descend_to_gla(
401
+ s, &entered_gla, &s->token_buffer[rtn_term_offset].offset);
402
+ else
403
+ status = descend_to_gla(s, &entered_gla, &s->offset);
404
+
405
+ if(entered_gla)
406
+ gla_term_offset = rtn_term_offset;
407
+ }
408
+
409
+ if(status == GZL_STATUS_OK) {
410
+ assert(s->parse_stack_len > 0);
411
+ frame = DYNARRAY_GET_TOP(s->parse_stack);
412
+ frame_type = frame->frame_type;
413
+ }
414
+ }
415
+ while(status == GZL_STATUS_OK &&
416
+ ((frame_type == GZL_FRAME_TYPE_RTN &&
417
+ rtn_term_offset < s->token_buffer_len) ||
418
+ (frame_type == GZL_FRAME_TYPE_GLA &&
419
+ gla_term_offset < s->token_buffer_len)));
420
+
421
+ /* We can have an EOF left over in the token buffer if the EOF token led us
422
+ * to a hard EOF, thus terminating the above loop before our "skip" above
423
+ * could cover this EOF special case. */
424
+ if(rtn_term_offset < s->token_buffer_len &&
425
+ s->token_buffer[rtn_term_offset].name == NULL)
426
+ rtn_term_offset++;
427
+
428
+ /* At this point we have consumed some (but possibly not all) of the
429
+ * terminals we have lexed. We consider a token fully consumed when it
430
+ * has caused an RTN transition (just a GLA transition doesn't leave the
431
+ * token consumed, because it will be used again for an RTN transition
432
+ * later.
433
+ *
434
+ * We now remove the consumed terminals from token_buffer. */
435
+ size_t remaining_terminals = s->token_buffer_len - rtn_term_offset;
436
+ if(remaining_terminals > 0)
437
+ memmove(s->token_buffer, s->token_buffer + rtn_term_offset,
438
+ remaining_terminals * sizeof(*s->token_buffer));
439
+ RESIZE_DYNARRAY(s->token_buffer, remaining_terminals);
440
+
441
+ /* Update open_terminal_offset. */
442
+ if(remaining_terminals > 0)
443
+ s->open_terminal_offset = s->token_buffer[0].offset;
444
+ else
445
+ s->open_terminal_offset = s->offset;
446
+
447
+ return status;
448
+ }
449
+
450
+
451
+ /*
452
+ * do_intfa_transition(): transitions an IntFA frame according to the given
453
+ * char, performing the appropriate GLA/RTN transitions if this puts the IntFA
454
+ * in a final state.
455
+ *
456
+ * Preconditions:
457
+ * - the current stack frame is an IntFA frame
458
+ *
459
+ * Postconditions:
460
+ * - the current stack frame is an IntFA frame unless we have hit a
461
+ * hard EOF in which case it is an RTN frame. Note that it could be either
462
+ * same IntFA frame or a different one.
463
+ *
464
+ * Note: we currently implement longest-match, assuming that the first
465
+ * non-matching character is only one longer than the longest match.
466
+ */
467
+ static
468
+ enum gzl_status do_intfa_transition(struct gzl_parse_state *s,
469
+ char ch)
470
+ {
471
+ struct gzl_parse_stack_frame *frame = DYNARRAY_GET_TOP(s->parse_stack);
472
+ assert(frame->frame_type == GZL_FRAME_TYPE_INTFA);
473
+ struct gzl_intfa_frame *intfa_frame = &frame->f.intfa_frame;
474
+ struct gzl_intfa_transition *t = find_intfa_transition(
475
+ intfa_frame->intfa_state, ch);
476
+ enum gzl_status status;
477
+
478
+ /* If this character did not have any transition, but the state we're coming
479
+ * from is final, then longest-match semantics say that we should return
480
+ * the last character's final state as the token. But if the state we're
481
+ * coming from is *not* final, it's just a parse error. */
482
+ if(!t) {
483
+ char *terminal = intfa_frame->intfa_state->final;
484
+ assert(terminal); /* TODO: handle this case. */
485
+ status = process_terminal(s, terminal, &frame->start_offset,
486
+ s->offset.byte - frame->start_offset.byte);
487
+ if(status != GZL_STATUS_OK) return status;
488
+ intfa_frame = push_intfa_frame_for_gla_or_rtn(s);
489
+ t = find_intfa_transition(intfa_frame->intfa_state, ch);
490
+ if(!t) {
491
+ /* Parse error: we encountered a character for which we have no
492
+ * transition. */
493
+ if(s->bound_grammar->error_char_cb)
494
+ s->bound_grammar->error_char_cb(s, ch);
495
+ return GZL_STATUS_ERROR;
496
+ }
497
+ }
498
+
499
+ /* We have finished processing transitions for the previous byte.
500
+ * Move on to the next byte. */
501
+ s->offset.byte++;
502
+
503
+ /* Deal with newlines. This is all very single-byte-encoding specific for
504
+ * the moment. */
505
+ bool is_newline_char = (ch == 0x0A || ch == 0x0D); /* LF and CR */
506
+ if(is_newline_char) {
507
+ if(!s->last_char_was_newline) {
508
+ s->offset.line++;
509
+ s->offset.column = 1;
510
+ }
511
+ }
512
+ else
513
+ s->offset.column++;
514
+ s->last_char_was_newline = is_newline_char;
515
+
516
+ /* Do the transition. */
517
+ intfa_frame->intfa_state = t->dest_state;
518
+
519
+ /* If the current state is final and there are no outgoing transitions,
520
+ * we *know* we don't have to wait any longer for the longest match.
521
+ * Transition the RTN or GLA now, for more on-line behavior. */
522
+ if(intfa_frame->intfa_state->final &&
523
+ (intfa_frame->intfa_state->num_transitions == 0)) {
524
+ status = process_terminal(s, intfa_frame->intfa_state->final,
525
+ &frame->start_offset,
526
+ s->offset.byte - frame->start_offset.byte);
527
+ if(status != GZL_STATUS_OK)
528
+ return status;
529
+ push_intfa_frame_for_gla_or_rtn(s);
530
+ }
531
+ return GZL_STATUS_OK;
532
+ }
533
+
534
+ /*
535
+ * The rest of this file is the publicly-exposed API, documented in the
536
+ * header file.
537
+ */
538
+
539
+ enum gzl_status gzl_parse(struct gzl_parse_state *s, char *buf, size_t buf_len)
540
+ {
541
+ enum gzl_status status = GZL_STATUS_OK;
542
+ size_t i;
543
+
544
+ /* For the first call, we need to push the initial frame and
545
+ * descend from the starting frame until we hit an IntFA frame. */
546
+ if(s->offset.byte == 0 && s->parse_stack_len == 0) {
547
+ push_rtn_frame(s, &s->bound_grammar->grammar->rtns[0], &s->offset);
548
+ bool entered_gla;
549
+ status = descend_to_gla(s, &entered_gla, &s->offset);
550
+ if(status == GZL_STATUS_OK) push_intfa_frame_for_gla_or_rtn(s);
551
+ }
552
+ if(s->parse_stack_len == 0) {
553
+ /* This gzl_parse_state has already hit hard EOF previously. */
554
+ return GZL_STATUS_HARD_EOF;
555
+ }
556
+
557
+ for(i = 0; i < buf_len && status == GZL_STATUS_OK; i++)
558
+ status = do_intfa_transition(s, buf[i]);
559
+ return status;
560
+ }
561
+
562
+ bool gzl_finish_parse(struct gzl_parse_state *s)
563
+ {
564
+ size_t i;
565
+ /* First deal with an open IntFA frame if there is one. The frame must
566
+ * be in a start state (in which case we back it out), a final state
567
+ * (in which case we recognize and process the terminal), or both (in
568
+ * which case we back out iff. we are in a GLA state with an EOF transition
569
+ * out). */
570
+ struct gzl_parse_stack_frame *frame = DYNARRAY_GET_TOP(s->parse_stack);
571
+ if(frame->frame_type == GZL_FRAME_TYPE_INTFA) {
572
+ struct gzl_intfa_frame *intfa_frame = &frame->f.intfa_frame;
573
+ if(intfa_frame->intfa_state->final &&
574
+ intfa_frame->intfa_state == &intfa_frame->intfa->states[0]) {
575
+ /* TODO: handle this case. */
576
+ assert(false);
577
+ } else if(intfa_frame->intfa_state->final) {
578
+ process_terminal(s, intfa_frame->intfa_state->final,
579
+ &frame->start_offset,
580
+ s->offset.byte - frame->start_offset.byte);
581
+ } else if(intfa_frame->intfa_state == &intfa_frame->intfa->states[0]) {
582
+ /* Pop the frame like it never happened. */
583
+ pop_intfa_frame(s);
584
+ } else {
585
+ /* IntFA is in neither a start nor a final state.
586
+ * This cannot be EOF. */
587
+ return false;
588
+ }
589
+ }
590
+
591
+ /* Next deal with an open GLA frame if there is one. The frame must be in
592
+ * a start state or have an outgoing EOF transition, else we are not at
593
+ * valid EOF. */
594
+ frame = DYNARRAY_GET_TOP(s->parse_stack);
595
+ if(frame->frame_type == GZL_FRAME_TYPE_GLA) {
596
+ struct gzl_gla_frame *gla_frame = &frame->f.gla_frame;
597
+ if(gla_frame->gla_state == &gla_frame->gla->states[0]) {
598
+ /* GLA is in a start state -- fine, we can just pop it as
599
+ * if it never happened. */
600
+ pop_gla_frame(s);
601
+ } else {
602
+ /* For this to still be valid EOF, this GLA state must have an
603
+ * outgoing EOF transition, and we must take it now. */
604
+ struct gzl_gla_transition *t =
605
+ find_gla_transition(gla_frame->gla_state, NULL);
606
+ if(!t) return false;
607
+
608
+ /* process_terminal() wants an IntFA frame to pop. */
609
+ push_empty_frame(s, GZL_FRAME_TYPE_INTFA, &s->offset);
610
+ process_terminal(s, NULL, &s->offset, 0);
611
+
612
+ /* Pop any GLA states that the previous may have pushed. */
613
+ while(s->parse_stack_len > 0 &&
614
+ DYNARRAY_GET_TOP(s->parse_stack)->frame_type !=
615
+ GZL_FRAME_TYPE_RTN)
616
+ pop_frame(s);
617
+ }
618
+ }
619
+
620
+ /* Now we should have only RTN frames open. Starting from the top, check
621
+ * that each frame's dest_state is a final state (or the actual current
622
+ * state in the bottommost frame). */
623
+ if(s->parse_stack_len > 0) { /* will be 0 if we already hit hard EOF. */
624
+ for(i = 0; i < s->parse_stack_len - 1; i++) {
625
+ frame = &s->parse_stack[i];
626
+ assert(frame->frame_type == GZL_FRAME_TYPE_RTN);
627
+ struct gzl_rtn_frame *rtn_frame = &frame->f.rtn_frame;
628
+ assert(rtn_frame->rtn_transition);
629
+ if(!rtn_frame->rtn_transition->dest_state->is_final) return false;
630
+ }
631
+
632
+ frame = DYNARRAY_GET_TOP(s->parse_stack);
633
+ struct gzl_rtn_frame *rtn_frame = &frame->f.rtn_frame;
634
+ if(!rtn_frame->rtn_state->is_final) return false;
635
+
636
+ /* We are truly in a state where EOF is ok. Pop remaining RTN frames to
637
+ * call callbacks appropriately. */
638
+ while(s->parse_stack_len > 0)
639
+ {
640
+ /* What should we do if the user cancels while the final RTN frames
641
+ * are being popped? It's kind of a weird thing to do. Options
642
+ * are to ignore it (we're finishing the parse anyway) or to stop.
643
+ * For now we ignore. */
644
+ pop_rtn_frame(s);
645
+ }
646
+ }
647
+
648
+ return true;
649
+ }
650
+
651
+ struct gzl_parse_state *gzl_alloc_parse_state()
652
+ {
653
+ struct gzl_parse_state *state = malloc(sizeof(*state));
654
+ INIT_DYNARRAY(state->parse_stack, 0, 16);
655
+ INIT_DYNARRAY(state->token_buffer, 0, 2);
656
+ return state;
657
+ }
658
+
659
+ struct gzl_parse_state *gzl_dup_parse_state(struct gzl_parse_state *orig)
660
+ {
661
+ struct gzl_parse_state *copy = malloc(sizeof(*copy));
662
+ /* This erroneously copies pointers to dynarrays, but we'll fix in a sec. */
663
+ *copy = *orig;
664
+ size_t i;
665
+
666
+ INIT_DYNARRAY(copy->parse_stack, 0, 16);
667
+ RESIZE_DYNARRAY(copy->parse_stack, orig->parse_stack_len);
668
+ for(i = 0; i < orig->parse_stack_len; i++)
669
+ copy->parse_stack[i] = orig->parse_stack[i];
670
+
671
+ INIT_DYNARRAY(copy->token_buffer, 0, 2);
672
+ RESIZE_DYNARRAY(copy->token_buffer, orig->token_buffer_len);
673
+ for(i = 0; i < orig->token_buffer_len; i++)
674
+ copy->token_buffer[i] = orig->token_buffer[i];
675
+
676
+ return copy;
677
+ }
678
+
679
+ void gzl_free_parse_state(struct gzl_parse_state *s)
680
+ {
681
+ FREE_DYNARRAY(s->parse_stack);
682
+ FREE_DYNARRAY(s->token_buffer);
683
+ free(s);
684
+ }
685
+
686
+ void gzl_init_parse_state(struct gzl_parse_state *s,
687
+ struct gzl_bound_grammar *bg)
688
+ {
689
+ s->offset.byte = 0;
690
+ s->offset.line = 1;
691
+ s->offset.column = 1;
692
+ s->open_terminal_offset = s->offset;
693
+ s->last_char_was_newline = false;
694
+ s->bound_grammar = bg;
695
+ RESIZE_DYNARRAY(s->parse_stack, 0);
696
+ RESIZE_DYNARRAY(s->token_buffer, 0);
697
+
698
+ /* Currently each stack frame takes 28 bytes on a 32-bit machine, so a
699
+ * stack depth of 500 is a modest 14kb of RAM. 500 frames of recursion is
700
+ * far deeper than we would expect any real text to be */
701
+ s->max_stack_depth = 500;
702
+
703
+ /* Currently each token of lookahead takes 20 bytes on a 32-bit machine, so
704
+ * a lookahead depth of 500 is 10kb of RAM. Input text would have to be
705
+ * truly pathological to require this much lookahead. */
706
+ s->max_lookahead = 500;
707
+ }
708
+
709
+ enum gzl_status gzl_parse_file(struct gzl_parse_state *state,
710
+ FILE *file, void *user_data,
711
+ size_t max_buffer_size)
712
+ {
713
+ struct gzl_buffer *buffer = malloc(sizeof(*buffer));
714
+ INIT_DYNARRAY(buffer->buf, 0, 4096);
715
+ buffer->buf_offset = 0;
716
+ buffer->bytes_parsed = 0;
717
+ buffer->user_data = user_data;
718
+ state->user_data = buffer;
719
+
720
+ /* The minimum amount of the data in the buffer that we want to be new data
721
+ * each time. This number shrinks as the amount of data we're preserving
722
+ * from open tokens grows. If the number is below this number we increase
723
+ * our buffer size. */
724
+ const size_t min_new_data = 4000;
725
+
726
+ enum gzl_status status;
727
+ bool is_eof = false;
728
+ do {
729
+ /* Make sure we have space for at least min_new_data new data. */
730
+ size_t new_buf_size = buffer->buf_size;
731
+ while(buffer->buf_len + min_new_data > new_buf_size)
732
+ buffer->buf_size *= 2;
733
+ if(new_buf_size > max_buffer_size) {
734
+ status = GZL_STATUS_RESOURCE_LIMIT_EXCEEDED;
735
+ break;
736
+ }
737
+ if(new_buf_size != buffer->buf_size) {
738
+ buffer->buf_size = new_buf_size;
739
+ buffer->buf = realloc(buffer->buf, new_buf_size);
740
+ }
741
+ size_t bytes_to_read = buffer->buf_size - buffer->buf_len;
742
+
743
+ /* Do the I/O and check for errors. */
744
+ size_t bytes_read = fread(buffer->buf + buffer->buf_len, 1,
745
+ bytes_to_read, file);
746
+ if(bytes_read < bytes_to_read) {
747
+ if(ferror(file)) {
748
+ status = GZL_STATUS_IO_ERROR;
749
+ break;
750
+ } else if(feof(file)) {
751
+ is_eof = true;
752
+ }
753
+ }
754
+
755
+ /* Do the parse. Start past whatever bytes we previously saved. */
756
+ char *parse_start = buffer->buf + buffer->buf_len;
757
+ buffer->buf_len += bytes_read;
758
+ status = gzl_parse(state, parse_start, bytes_read);
759
+
760
+ /* Preserve all data from tokens that haven't been returned yet:
761
+ *
762
+ * buf size len
763
+ * | | |
764
+ * v v v
765
+ * Buffer: -----------------------------------------------------------
766
+ * ^ ^ ^ ^
767
+ * | | | |
768
+ * buf_offset | | state->offset
769
+ * | |
770
+ * previous value of current value of
771
+ * state->open_terminal_offset state->open_terminal_offset
772
+ *
773
+ * |----| <-- Data we were previously saving.
774
+ *
775
+ * Data we should now be saving --> |------------|
776
+ */
777
+
778
+ size_t bytes_to_discard = state->open_terminal_offset.byte -
779
+ buffer->buf_offset;
780
+ size_t bytes_to_save = buffer->buf_size - bytes_to_discard;
781
+ char *buf_to_save_from = buffer->buf + bytes_to_discard;
782
+ assert(bytes_to_discard <= (size_t) buffer->buf_len); /* hasn't overflowed. */
783
+
784
+ memmove(buffer->buf, buf_to_save_from, bytes_to_save);
785
+ buffer->buf_offset += bytes_to_discard;
786
+ buffer->buf_len = bytes_to_save;
787
+ } while(status == GZL_STATUS_OK && !is_eof);
788
+
789
+ if(status == GZL_STATUS_HARD_EOF || (status == GZL_STATUS_OK && is_eof)) {
790
+ if(gzl_finish_parse(state)) {
791
+ if(!feof(file) || buffer->buf_len > 0) {
792
+ /* There was data left over -- we hit grammar EOF before
793
+ * file EOF. */
794
+ status = GZL_STATUS_OK;
795
+ } else
796
+ status = GZL_STATUS_PREMATURE_EOF_ERROR;
797
+ } else
798
+ status = GZL_STATUS_PREMATURE_EOF_ERROR;
799
+ }
800
+
801
+ FREE_DYNARRAY(buffer->buf);
802
+ free(buffer);
803
+ return status;
804
+ }
805
+
806
+ /*
807
+ * Local Variables:
808
+ * c-file-style: "bsd"
809
+ * c-basic-offset: 4
810
+ * indent-tabs-mode: nil
811
+ * End:
812
+ * vim:et:sts=4:sw=4
813
+ */