smtlaissezfaire-gazelle 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +4 -0
- data/ext/gazelle_ruby_bindings/extconf.rb +6 -0
- data/ext/gazelle_ruby_bindings/gazelle_ruby_bindings.c +119 -0
- data/ext/gazelle_ruby_bindings/gazelle_ruby_bindings.h +20 -0
- data/ext/gazelle_ruby_bindings/includes/bc_read_stream.c +872 -0
- data/ext/gazelle_ruby_bindings/includes/load_grammar.c +563 -0
- data/ext/gazelle_ruby_bindings/includes/parse.c +813 -0
- data/lib/gazelle.rb +11 -0
- data/lib/gazelle/gemspec.rb +39 -0
- data/lib/gazelle/parser.rb +31 -0
- data/spec/gazelle_integration_spec.rb +97 -0
- data/spec/hello.gzc +0 -0
- data/spec/hello.gzl +1 -0
- data/spec/invalid_format.gzc +0 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +5 -0
- data/tasks/c_extensions.rake +4 -0
- data/tasks/flog.rake +10 -0
- data/tasks/gem.rake +8 -0
- data/tasks/rspec.rake +20 -0
- data/tasks/sloc.rake +16 -0
- data/tasks/tags.rake +23 -0
- metadata +75 -0
@@ -0,0 +1,813 @@
|
|
1
|
+
/*********************************************************************
|
2
|
+
|
3
|
+
Gazelle: a system for building fast, reusable parsers
|
4
|
+
|
5
|
+
interpreter.c
|
6
|
+
|
7
|
+
Once a compiled grammar has been loaded into memory, the routines
|
8
|
+
in this file are what actually does the parsing. This file is an
|
9
|
+
"interpreter" in the sense that it parses the input by using the
|
10
|
+
grammar as a data structure -- no grammar-specific code is ever
|
11
|
+
generated or executed. Despite this, it is still quite fast, and
|
12
|
+
has a very low memory footprint.
|
13
|
+
|
14
|
+
The interpreter primarily consists of maintaining the parse stack
|
15
|
+
properly and transitioning the frames in response to the input.
|
16
|
+
|
17
|
+
Copyright (c) 2007-2009 Joshua Haberman. See LICENSE for details.
|
18
|
+
|
19
|
+
*********************************************************************/
|
20
|
+
|
21
|
+
#include <stdio.h>
|
22
|
+
#include <stdlib.h>
|
23
|
+
#include <assert.h>
|
24
|
+
#include <string.h>
|
25
|
+
|
26
|
+
#include "gazelle/parse.h"
|
27
|
+
|
28
|
+
/*
|
29
|
+
* The following are stack-manipulation functions. Gazelle maintains a runtime
|
30
|
+
* stack (which is completely separate from the C stack), and these functions
|
31
|
+
* provide pushing and popping of different kinds of stack frames.
|
32
|
+
*/
|
33
|
+
|
34
|
+
static
|
35
|
+
struct gzl_parse_stack_frame *push_empty_frame(struct gzl_parse_state *s,
|
36
|
+
enum gzl_frame_type frame_type,
|
37
|
+
struct gzl_offset *start_offset)
|
38
|
+
{
|
39
|
+
RESIZE_DYNARRAY(s->parse_stack, s->parse_stack_len+1);
|
40
|
+
struct gzl_parse_stack_frame *frame = DYNARRAY_GET_TOP(s->parse_stack);
|
41
|
+
frame->frame_type = frame_type;
|
42
|
+
frame->start_offset = *start_offset;
|
43
|
+
return frame;
|
44
|
+
}
|
45
|
+
|
46
|
+
static
|
47
|
+
struct gzl_intfa_frame *push_intfa_frame(struct gzl_parse_state *s,
|
48
|
+
struct gzl_intfa *intfa,
|
49
|
+
struct gzl_offset *start_offset)
|
50
|
+
{
|
51
|
+
struct gzl_parse_stack_frame *frame =
|
52
|
+
push_empty_frame(s, GZL_FRAME_TYPE_INTFA, start_offset);
|
53
|
+
struct gzl_intfa_frame *intfa_frame = &frame->f.intfa_frame;
|
54
|
+
intfa_frame->intfa = intfa;
|
55
|
+
intfa_frame->intfa_state = &intfa->states[0];
|
56
|
+
return intfa_frame;
|
57
|
+
}
|
58
|
+
|
59
|
+
static
|
60
|
+
struct gzl_parse_stack_frame *push_gla_frame(struct gzl_parse_state *s,
|
61
|
+
struct gzl_gla *gla,
|
62
|
+
struct gzl_offset *start_offset)
|
63
|
+
{
|
64
|
+
struct gzl_parse_stack_frame *frame =
|
65
|
+
push_empty_frame(s, GZL_FRAME_TYPE_GLA, start_offset);
|
66
|
+
struct gzl_gla_frame *gla_frame = &frame->f.gla_frame;
|
67
|
+
gla_frame->gla = gla;
|
68
|
+
gla_frame->gla_state = &gla->states[0];
|
69
|
+
return frame;
|
70
|
+
}
|
71
|
+
|
72
|
+
static
|
73
|
+
enum gzl_status push_rtn_frame(struct gzl_parse_state *s,
|
74
|
+
struct gzl_rtn *rtn,
|
75
|
+
struct gzl_offset *start_offset)
|
76
|
+
{
|
77
|
+
struct gzl_parse_stack_frame *new_frame =
|
78
|
+
push_empty_frame(s, GZL_FRAME_TYPE_RTN, start_offset);
|
79
|
+
struct gzl_rtn_frame *new_rtn_frame = &new_frame->f.rtn_frame;
|
80
|
+
new_rtn_frame->rtn = rtn;
|
81
|
+
new_rtn_frame->rtn_transition = NULL;
|
82
|
+
new_rtn_frame->rtn_state = &new_rtn_frame->rtn->states[0];
|
83
|
+
if(s->bound_grammar->start_rule_cb) s->bound_grammar->start_rule_cb(s);
|
84
|
+
return GZL_STATUS_OK;
|
85
|
+
}
|
86
|
+
|
87
|
+
static
|
88
|
+
enum gzl_status push_rtn_frame_for_transition(struct gzl_parse_state *s,
|
89
|
+
struct gzl_rtn_transition *t,
|
90
|
+
struct gzl_offset *start_offset)
|
91
|
+
{
|
92
|
+
struct gzl_rtn_frame *old_rtn_frame =
|
93
|
+
&DYNARRAY_GET_TOP(s->parse_stack)->f.rtn_frame;
|
94
|
+
old_rtn_frame->rtn_transition = t;
|
95
|
+
return push_rtn_frame(s, t->edge.nonterminal, start_offset);
|
96
|
+
}
|
97
|
+
|
98
|
+
static
|
99
|
+
struct gzl_parse_stack_frame *pop_frame(struct gzl_parse_state *s)
|
100
|
+
{
|
101
|
+
assert(s->parse_stack_len > 0);
|
102
|
+
RESIZE_DYNARRAY(s->parse_stack, s->parse_stack_len-1);
|
103
|
+
return s->parse_stack_len > 0 ? DYNARRAY_GET_TOP(s->parse_stack) : NULL;
|
104
|
+
}
|
105
|
+
|
106
|
+
static
|
107
|
+
enum gzl_status pop_rtn_frame(struct gzl_parse_state *s)
|
108
|
+
{
|
109
|
+
assert(DYNARRAY_GET_TOP(s->parse_stack)->frame_type == GZL_FRAME_TYPE_RTN);
|
110
|
+
if(s->bound_grammar->end_rule_cb) s->bound_grammar->end_rule_cb(s);
|
111
|
+
|
112
|
+
struct gzl_parse_stack_frame *frame = pop_frame(s);
|
113
|
+
if(frame) {
|
114
|
+
assert(frame->frame_type == GZL_FRAME_TYPE_RTN);
|
115
|
+
struct gzl_rtn_frame *rtn_frame = &frame->f.rtn_frame;
|
116
|
+
if(rtn_frame->rtn_transition)
|
117
|
+
rtn_frame->rtn_state = rtn_frame->rtn_transition->dest_state;
|
118
|
+
else {
|
119
|
+
/* Should only happen at the top level. */
|
120
|
+
assert(s->parse_stack_len == 1);
|
121
|
+
}
|
122
|
+
return GZL_STATUS_OK;
|
123
|
+
} else
|
124
|
+
return GZL_STATUS_HARD_EOF;
|
125
|
+
}
|
126
|
+
|
127
|
+
static
|
128
|
+
struct gzl_parse_stack_frame *pop_gla_frame(struct gzl_parse_state *s)
|
129
|
+
{
|
130
|
+
assert(DYNARRAY_GET_TOP(s->parse_stack)->frame_type == GZL_FRAME_TYPE_GLA);
|
131
|
+
return pop_frame(s);
|
132
|
+
}
|
133
|
+
|
134
|
+
static
|
135
|
+
struct gzl_parse_stack_frame *pop_intfa_frame(struct gzl_parse_state *s)
|
136
|
+
{
|
137
|
+
assert(DYNARRAY_GET_TOP(s->parse_stack)->frame_type ==
|
138
|
+
GZL_FRAME_TYPE_INTFA);
|
139
|
+
return pop_frame(s);
|
140
|
+
}
|
141
|
+
|
142
|
+
/*
|
143
|
+
* descend_to_gla(): given the current parse stack, pushes any RTN or GLA
|
144
|
+
* stack frames representing transitions that can be taken without consuming
|
145
|
+
* any terminals.
|
146
|
+
*
|
147
|
+
* Preconditions:
|
148
|
+
* - the current frame is either an RTN frame or a GLA frame
|
149
|
+
*
|
150
|
+
* Postconditions:
|
151
|
+
* - the current frame is an RTN frame or a GLA frame. If a new GLA frame was
|
152
|
+
* entered, entered_gla is set to true.
|
153
|
+
*/
|
154
|
+
static
|
155
|
+
enum gzl_status descend_to_gla(struct gzl_parse_state *s, bool *entered_gla,
|
156
|
+
struct gzl_offset *start_offset)
|
157
|
+
{
|
158
|
+
*entered_gla = false;
|
159
|
+
while(true) {
|
160
|
+
struct gzl_parse_stack_frame *frame = DYNARRAY_GET_TOP(s->parse_stack);
|
161
|
+
if(frame->frame_type != GZL_FRAME_TYPE_RTN) return GZL_STATUS_OK;
|
162
|
+
|
163
|
+
/* Subtract 1 because there can be one IntFA frame beyond the RTN and
|
164
|
+
* GLA frames this function pushes. */
|
165
|
+
if(s->parse_stack_len >= s->max_stack_depth-1)
|
166
|
+
return GZL_STATUS_RESOURCE_LIMIT_EXCEEDED;
|
167
|
+
|
168
|
+
struct gzl_rtn_frame *rtn_frame = &frame->f.rtn_frame;
|
169
|
+
switch(rtn_frame->rtn_state->lookahead_type) {
|
170
|
+
case GZL_STATE_HAS_INTFA:
|
171
|
+
return GZL_STATUS_OK;
|
172
|
+
|
173
|
+
case GZL_STATE_HAS_GLA:
|
174
|
+
*entered_gla = true;
|
175
|
+
push_gla_frame(s, rtn_frame->rtn_state->d.state_gla, start_offset);
|
176
|
+
return GZL_STATUS_OK;
|
177
|
+
|
178
|
+
case GZL_STATE_HAS_NEITHER:
|
179
|
+
/* An RTN state has neither an IntFA or a GLA in only two cases:
|
180
|
+
* - it is a final state with no outgoing transitions
|
181
|
+
* - it is a nonfinal state with only one transition (a nonterminal)
|
182
|
+
*/
|
183
|
+
assert(rtn_frame->rtn_state->num_transitions < 2);
|
184
|
+
enum gzl_status status = GZL_STATUS_OK;
|
185
|
+
if(rtn_frame->rtn_state->num_transitions == 0)
|
186
|
+
status = pop_rtn_frame(s); /* Final state */
|
187
|
+
else if(rtn_frame->rtn_state->num_transitions == 1) {
|
188
|
+
assert(rtn_frame->rtn_state->transitions[0].transition_type ==
|
189
|
+
GZL_NONTERM_TRANSITION);
|
190
|
+
status = push_rtn_frame_for_transition(
|
191
|
+
s, &rtn_frame->rtn_state->transitions[0], start_offset);
|
192
|
+
}
|
193
|
+
if(status != GZL_STATUS_OK) return status;
|
194
|
+
break;
|
195
|
+
}
|
196
|
+
}
|
197
|
+
}
|
198
|
+
|
199
|
+
static
|
200
|
+
struct gzl_intfa_frame *push_intfa_frame_for_gla_or_rtn(
|
201
|
+
struct gzl_parse_state *s)
|
202
|
+
{
|
203
|
+
struct gzl_parse_stack_frame *frame = DYNARRAY_GET_TOP(s->parse_stack);
|
204
|
+
if(frame->frame_type == GZL_FRAME_TYPE_GLA) {
|
205
|
+
struct gzl_gla_state *gla_state = frame->f.gla_frame.gla_state;
|
206
|
+
assert(gla_state->is_final == false);
|
207
|
+
return push_intfa_frame(s, gla_state->d.nonfinal.intfa, &s->offset);
|
208
|
+
} else if(frame->frame_type == GZL_FRAME_TYPE_RTN) {
|
209
|
+
struct gzl_rtn_state *rtn_state = frame->f.rtn_frame.rtn_state;
|
210
|
+
assert(rtn_state->lookahead_type == GZL_STATE_HAS_INTFA);
|
211
|
+
return push_intfa_frame(s, rtn_state->d.state_intfa, &s->offset);
|
212
|
+
}
|
213
|
+
assert(false); /* should never reach here. */
|
214
|
+
return NULL;
|
215
|
+
}
|
216
|
+
|
217
|
+
static
|
218
|
+
enum gzl_status do_rtn_terminal_transition(struct gzl_parse_state *s,
|
219
|
+
struct gzl_rtn_transition *t,
|
220
|
+
struct gzl_terminal *terminal)
|
221
|
+
{
|
222
|
+
struct gzl_parse_stack_frame *frame = DYNARRAY_GET_TOP(s->parse_stack);
|
223
|
+
assert(frame->frame_type == GZL_FRAME_TYPE_RTN);
|
224
|
+
struct gzl_rtn_frame *rtn_frame = &frame->f.rtn_frame;
|
225
|
+
rtn_frame->rtn_transition = t;
|
226
|
+
if(s->bound_grammar->terminal_cb)
|
227
|
+
s->bound_grammar->terminal_cb(s, terminal);
|
228
|
+
assert(t->transition_type == GZL_TERMINAL_TRANSITION);
|
229
|
+
rtn_frame->rtn_state = t->dest_state;
|
230
|
+
return GZL_STATUS_OK;
|
231
|
+
}
|
232
|
+
|
233
|
+
static
|
234
|
+
struct gzl_rtn_transition *find_rtn_terminal_transition(
|
235
|
+
struct gzl_rtn_state *rtn_state, struct gzl_terminal *terminal)
|
236
|
+
{
|
237
|
+
int i;
|
238
|
+
for(i = 0; i < rtn_state->num_transitions; i++) {
|
239
|
+
struct gzl_rtn_transition *t = &rtn_state->transitions[i];
|
240
|
+
if(t->transition_type == GZL_TERMINAL_TRANSITION &&
|
241
|
+
t->edge.terminal_name == terminal->name)
|
242
|
+
return t;
|
243
|
+
}
|
244
|
+
return NULL;
|
245
|
+
}
|
246
|
+
|
247
|
+
static
|
248
|
+
struct gzl_gla_transition *find_gla_transition(struct gzl_gla_state *gla_state,
|
249
|
+
char *term_name)
|
250
|
+
{
|
251
|
+
int i;
|
252
|
+
for(i = 0; i < gla_state->d.nonfinal.num_transitions; i++) {
|
253
|
+
struct gzl_gla_transition *t = &gla_state->d.nonfinal.transitions[i];
|
254
|
+
if(t->term == term_name)
|
255
|
+
return t;
|
256
|
+
}
|
257
|
+
return NULL;
|
258
|
+
}
|
259
|
+
|
260
|
+
static
|
261
|
+
struct gzl_intfa_transition *find_intfa_transition(
|
262
|
+
struct gzl_intfa_state *intfa_state, char ch)
|
263
|
+
{
|
264
|
+
int i;
|
265
|
+
for(i = 0; i < intfa_state->num_transitions; i++) {
|
266
|
+
struct gzl_intfa_transition *t = &intfa_state->transitions[i];
|
267
|
+
if(ch >= t->ch_low && ch <= t->ch_high)
|
268
|
+
return t;
|
269
|
+
}
|
270
|
+
return NULL;
|
271
|
+
}
|
272
|
+
|
273
|
+
/*
|
274
|
+
* do_gla_transition(): transitions a GLA frame, performing the appropriate
|
275
|
+
* RTN transitions if this puts the GLA in a final state.
|
276
|
+
*
|
277
|
+
* Preconditions:
|
278
|
+
* - the current stack frame is a GLA frame
|
279
|
+
* - term is a terminal that came from this GLA state's intfa
|
280
|
+
*
|
281
|
+
* Postconditions:
|
282
|
+
* - the current stack frame is a GLA frame (this would indicate that
|
283
|
+
* the GLA hasn't hit a final state yet) or the current stack frame is
|
284
|
+
* an RTN frame (indicating we *have* hit a final state in the GLA)
|
285
|
+
*/
|
286
|
+
static
|
287
|
+
enum gzl_status do_gla_transition(struct gzl_parse_state *s,
|
288
|
+
struct gzl_terminal *term,
|
289
|
+
size_t *rtn_term_offset)
|
290
|
+
{
|
291
|
+
struct gzl_parse_stack_frame *frame = DYNARRAY_GET_TOP(s->parse_stack);
|
292
|
+
assert(frame->frame_type == GZL_FRAME_TYPE_GLA);
|
293
|
+
assert(frame->f.gla_frame.gla_state->is_final == false);
|
294
|
+
struct gzl_gla_state *gla_state = frame->f.gla_frame.gla_state;
|
295
|
+
struct gzl_gla_state *dest_gla_state = NULL;
|
296
|
+
|
297
|
+
/* Find the transition. */
|
298
|
+
struct gzl_gla_transition *t = find_gla_transition(gla_state, term->name);
|
299
|
+
if(!t) {
|
300
|
+
/* Parse error: terminal for which we had no GLA transition. */
|
301
|
+
if(s->bound_grammar->error_terminal_cb)
|
302
|
+
s->bound_grammar->error_terminal_cb(s, term);
|
303
|
+
return GZL_STATUS_ERROR;
|
304
|
+
}
|
305
|
+
/* Perform the transition. */
|
306
|
+
assert(t->dest_state);
|
307
|
+
frame->f.gla_frame.gla_state = t->dest_state;
|
308
|
+
dest_gla_state = t->dest_state;
|
309
|
+
|
310
|
+
/* Perform appropriate actions if we're in a final state. */
|
311
|
+
enum gzl_status status = GZL_STATUS_OK;
|
312
|
+
if(dest_gla_state->is_final) {
|
313
|
+
/* Pop the GLA frame (since now we know what RTN transition to take)
|
314
|
+
* and use its information to make an RTN transition. */
|
315
|
+
int offset = dest_gla_state->d.final.transition_offset;
|
316
|
+
frame = pop_gla_frame(s);
|
317
|
+
if(offset == 0)
|
318
|
+
status = pop_rtn_frame(s);
|
319
|
+
else {
|
320
|
+
struct gzl_rtn_state *rtn_state = frame->f.rtn_frame.rtn_state;
|
321
|
+
struct gzl_rtn_transition *t = &rtn_state->transitions[offset-1];
|
322
|
+
struct gzl_terminal *next_term = &s->token_buffer[*rtn_term_offset];
|
323
|
+
if(t->transition_type == GZL_TERMINAL_TRANSITION) {
|
324
|
+
/* The transition must match what we have in the token buffer */
|
325
|
+
assert(next_term->name == t->edge.terminal_name);
|
326
|
+
(*rtn_term_offset)++;
|
327
|
+
status = do_rtn_terminal_transition(s, t, next_term);
|
328
|
+
} else
|
329
|
+
status = push_rtn_frame_for_transition(
|
330
|
+
s, t, &next_term->offset);
|
331
|
+
}
|
332
|
+
}
|
333
|
+
return status;
|
334
|
+
}
|
335
|
+
|
336
|
+
/*
|
337
|
+
* process_terminal(): processes a terminal that was just lexed, possibly
|
338
|
+
* triggering a series of RTN and/or GLA transitions.
|
339
|
+
*
|
340
|
+
* Preconditions:
|
341
|
+
* - the current stack frame is an intfa frame representing the intfa that
|
342
|
+
* just produced this terminal
|
343
|
+
* - the given terminal can be recognized by the current GLA or RTN state
|
344
|
+
*
|
345
|
+
* Postconditions:
|
346
|
+
* - the current stack frame is an GLA or RTN frame representing the state after
|
347
|
+
* all available GLA and RTN transitions have been taken.
|
348
|
+
*/
|
349
|
+
|
350
|
+
static
|
351
|
+
enum gzl_status process_terminal(struct gzl_parse_state *s, char *term_name,
|
352
|
+
struct gzl_offset *start_offset, int len)
|
353
|
+
{
|
354
|
+
pop_intfa_frame(s);
|
355
|
+
struct gzl_parse_stack_frame *frame = DYNARRAY_GET_TOP(s->parse_stack);
|
356
|
+
size_t rtn_term_offset = 0;
|
357
|
+
size_t gla_term_offset = s->token_buffer_len;
|
358
|
+
|
359
|
+
RESIZE_DYNARRAY(s->token_buffer, s->token_buffer_len+1);
|
360
|
+
if(s->token_buffer_len >= s->max_lookahead)
|
361
|
+
return GZL_STATUS_RESOURCE_LIMIT_EXCEEDED;
|
362
|
+
|
363
|
+
struct gzl_terminal *term = DYNARRAY_GET_TOP(s->token_buffer);
|
364
|
+
term->name = term_name;
|
365
|
+
term->offset = *start_offset;
|
366
|
+
term->len = len;
|
367
|
+
|
368
|
+
/* Feed tokens to RTNs and GLAs until we have processed all the tokens we
|
369
|
+
* have. */
|
370
|
+
enum gzl_status status = GZL_STATUS_OK;
|
371
|
+
enum gzl_frame_type frame_type = frame->frame_type;
|
372
|
+
do {
|
373
|
+
/* Take one terminal transition, for either an RTN or a GLA. */
|
374
|
+
if(frame_type == GZL_FRAME_TYPE_RTN) {
|
375
|
+
struct gzl_terminal *rtn_term = &s->token_buffer[rtn_term_offset];
|
376
|
+
struct gzl_rtn_transition *t;
|
377
|
+
rtn_term_offset++;
|
378
|
+
|
379
|
+
if(rtn_term->name == NULL)
|
380
|
+
/* Skip: RTNs don't process EOF as a terminal, only GLAs do. */
|
381
|
+
continue;
|
382
|
+
t = find_rtn_terminal_transition(frame->f.rtn_frame.rtn_state,
|
383
|
+
rtn_term);
|
384
|
+
if(!t) {
|
385
|
+
/* Parse error: terminal for which we had no RTN transition. */
|
386
|
+
if(s->bound_grammar->error_terminal_cb)
|
387
|
+
s->bound_grammar->error_terminal_cb(s, term);
|
388
|
+
return GZL_STATUS_ERROR;
|
389
|
+
}
|
390
|
+
status = do_rtn_terminal_transition(s, t, rtn_term);
|
391
|
+
} else {
|
392
|
+
struct gzl_terminal *gla_term = &s->token_buffer[gla_term_offset++];
|
393
|
+
status = do_gla_transition(s, gla_term, &rtn_term_offset);
|
394
|
+
}
|
395
|
+
|
396
|
+
/* Having taken a transition, push any new frames onto the stack. */
|
397
|
+
if(status == GZL_STATUS_OK) {
|
398
|
+
bool entered_gla;
|
399
|
+
if(rtn_term_offset < s->token_buffer_len)
|
400
|
+
status = descend_to_gla(
|
401
|
+
s, &entered_gla, &s->token_buffer[rtn_term_offset].offset);
|
402
|
+
else
|
403
|
+
status = descend_to_gla(s, &entered_gla, &s->offset);
|
404
|
+
|
405
|
+
if(entered_gla)
|
406
|
+
gla_term_offset = rtn_term_offset;
|
407
|
+
}
|
408
|
+
|
409
|
+
if(status == GZL_STATUS_OK) {
|
410
|
+
assert(s->parse_stack_len > 0);
|
411
|
+
frame = DYNARRAY_GET_TOP(s->parse_stack);
|
412
|
+
frame_type = frame->frame_type;
|
413
|
+
}
|
414
|
+
}
|
415
|
+
while(status == GZL_STATUS_OK &&
|
416
|
+
((frame_type == GZL_FRAME_TYPE_RTN &&
|
417
|
+
rtn_term_offset < s->token_buffer_len) ||
|
418
|
+
(frame_type == GZL_FRAME_TYPE_GLA &&
|
419
|
+
gla_term_offset < s->token_buffer_len)));
|
420
|
+
|
421
|
+
/* We can have an EOF left over in the token buffer if the EOF token led us
|
422
|
+
* to a hard EOF, thus terminating the above loop before our "skip" above
|
423
|
+
* could cover this EOF special case. */
|
424
|
+
if(rtn_term_offset < s->token_buffer_len &&
|
425
|
+
s->token_buffer[rtn_term_offset].name == NULL)
|
426
|
+
rtn_term_offset++;
|
427
|
+
|
428
|
+
/* At this point we have consumed some (but possibly not all) of the
|
429
|
+
* terminals we have lexed. We consider a token fully consumed when it
|
430
|
+
* has caused an RTN transition (just a GLA transition doesn't leave the
|
431
|
+
* token consumed, because it will be used again for an RTN transition
|
432
|
+
* later.
|
433
|
+
*
|
434
|
+
* We now remove the consumed terminals from token_buffer. */
|
435
|
+
size_t remaining_terminals = s->token_buffer_len - rtn_term_offset;
|
436
|
+
if(remaining_terminals > 0)
|
437
|
+
memmove(s->token_buffer, s->token_buffer + rtn_term_offset,
|
438
|
+
remaining_terminals * sizeof(*s->token_buffer));
|
439
|
+
RESIZE_DYNARRAY(s->token_buffer, remaining_terminals);
|
440
|
+
|
441
|
+
/* Update open_terminal_offset. */
|
442
|
+
if(remaining_terminals > 0)
|
443
|
+
s->open_terminal_offset = s->token_buffer[0].offset;
|
444
|
+
else
|
445
|
+
s->open_terminal_offset = s->offset;
|
446
|
+
|
447
|
+
return status;
|
448
|
+
}
|
449
|
+
|
450
|
+
|
451
|
+
/*
|
452
|
+
* do_intfa_transition(): transitions an IntFA frame according to the given
|
453
|
+
* char, performing the appropriate GLA/RTN transitions if this puts the IntFA
|
454
|
+
* in a final state.
|
455
|
+
*
|
456
|
+
* Preconditions:
|
457
|
+
* - the current stack frame is an IntFA frame
|
458
|
+
*
|
459
|
+
* Postconditions:
|
460
|
+
* - the current stack frame is an IntFA frame unless we have hit a
|
461
|
+
* hard EOF in which case it is an RTN frame. Note that it could be either
|
462
|
+
* same IntFA frame or a different one.
|
463
|
+
*
|
464
|
+
* Note: we currently implement longest-match, assuming that the first
|
465
|
+
* non-matching character is only one longer than the longest match.
|
466
|
+
*/
|
467
|
+
static
|
468
|
+
enum gzl_status do_intfa_transition(struct gzl_parse_state *s,
|
469
|
+
char ch)
|
470
|
+
{
|
471
|
+
struct gzl_parse_stack_frame *frame = DYNARRAY_GET_TOP(s->parse_stack);
|
472
|
+
assert(frame->frame_type == GZL_FRAME_TYPE_INTFA);
|
473
|
+
struct gzl_intfa_frame *intfa_frame = &frame->f.intfa_frame;
|
474
|
+
struct gzl_intfa_transition *t = find_intfa_transition(
|
475
|
+
intfa_frame->intfa_state, ch);
|
476
|
+
enum gzl_status status;
|
477
|
+
|
478
|
+
/* If this character did not have any transition, but the state we're coming
|
479
|
+
* from is final, then longest-match semantics say that we should return
|
480
|
+
* the last character's final state as the token. But if the state we're
|
481
|
+
* coming from is *not* final, it's just a parse error. */
|
482
|
+
if(!t) {
|
483
|
+
char *terminal = intfa_frame->intfa_state->final;
|
484
|
+
assert(terminal); /* TODO: handle this case. */
|
485
|
+
status = process_terminal(s, terminal, &frame->start_offset,
|
486
|
+
s->offset.byte - frame->start_offset.byte);
|
487
|
+
if(status != GZL_STATUS_OK) return status;
|
488
|
+
intfa_frame = push_intfa_frame_for_gla_or_rtn(s);
|
489
|
+
t = find_intfa_transition(intfa_frame->intfa_state, ch);
|
490
|
+
if(!t) {
|
491
|
+
/* Parse error: we encountered a character for which we have no
|
492
|
+
* transition. */
|
493
|
+
if(s->bound_grammar->error_char_cb)
|
494
|
+
s->bound_grammar->error_char_cb(s, ch);
|
495
|
+
return GZL_STATUS_ERROR;
|
496
|
+
}
|
497
|
+
}
|
498
|
+
|
499
|
+
/* We have finished processing transitions for the previous byte.
|
500
|
+
* Move on to the next byte. */
|
501
|
+
s->offset.byte++;
|
502
|
+
|
503
|
+
/* Deal with newlines. This is all very single-byte-encoding specific for
|
504
|
+
* the moment. */
|
505
|
+
bool is_newline_char = (ch == 0x0A || ch == 0x0D); /* LF and CR */
|
506
|
+
if(is_newline_char) {
|
507
|
+
if(!s->last_char_was_newline) {
|
508
|
+
s->offset.line++;
|
509
|
+
s->offset.column = 1;
|
510
|
+
}
|
511
|
+
}
|
512
|
+
else
|
513
|
+
s->offset.column++;
|
514
|
+
s->last_char_was_newline = is_newline_char;
|
515
|
+
|
516
|
+
/* Do the transition. */
|
517
|
+
intfa_frame->intfa_state = t->dest_state;
|
518
|
+
|
519
|
+
/* If the current state is final and there are no outgoing transitions,
|
520
|
+
* we *know* we don't have to wait any longer for the longest match.
|
521
|
+
* Transition the RTN or GLA now, for more on-line behavior. */
|
522
|
+
if(intfa_frame->intfa_state->final &&
|
523
|
+
(intfa_frame->intfa_state->num_transitions == 0)) {
|
524
|
+
status = process_terminal(s, intfa_frame->intfa_state->final,
|
525
|
+
&frame->start_offset,
|
526
|
+
s->offset.byte - frame->start_offset.byte);
|
527
|
+
if(status != GZL_STATUS_OK)
|
528
|
+
return status;
|
529
|
+
push_intfa_frame_for_gla_or_rtn(s);
|
530
|
+
}
|
531
|
+
return GZL_STATUS_OK;
|
532
|
+
}
|
533
|
+
|
534
|
+
/*
|
535
|
+
* The rest of this file is the publicly-exposed API, documented in the
|
536
|
+
* header file.
|
537
|
+
*/
|
538
|
+
|
539
|
+
enum gzl_status gzl_parse(struct gzl_parse_state *s, char *buf, size_t buf_len)
|
540
|
+
{
|
541
|
+
enum gzl_status status = GZL_STATUS_OK;
|
542
|
+
size_t i;
|
543
|
+
|
544
|
+
/* For the first call, we need to push the initial frame and
|
545
|
+
* descend from the starting frame until we hit an IntFA frame. */
|
546
|
+
if(s->offset.byte == 0 && s->parse_stack_len == 0) {
|
547
|
+
push_rtn_frame(s, &s->bound_grammar->grammar->rtns[0], &s->offset);
|
548
|
+
bool entered_gla;
|
549
|
+
status = descend_to_gla(s, &entered_gla, &s->offset);
|
550
|
+
if(status == GZL_STATUS_OK) push_intfa_frame_for_gla_or_rtn(s);
|
551
|
+
}
|
552
|
+
if(s->parse_stack_len == 0) {
|
553
|
+
/* This gzl_parse_state has already hit hard EOF previously. */
|
554
|
+
return GZL_STATUS_HARD_EOF;
|
555
|
+
}
|
556
|
+
|
557
|
+
for(i = 0; i < buf_len && status == GZL_STATUS_OK; i++)
|
558
|
+
status = do_intfa_transition(s, buf[i]);
|
559
|
+
return status;
|
560
|
+
}
|
561
|
+
|
562
|
+
bool gzl_finish_parse(struct gzl_parse_state *s)
|
563
|
+
{
|
564
|
+
size_t i;
|
565
|
+
/* First deal with an open IntFA frame if there is one. The frame must
|
566
|
+
* be in a start state (in which case we back it out), a final state
|
567
|
+
* (in which case we recognize and process the terminal), or both (in
|
568
|
+
* which case we back out iff. we are in a GLA state with an EOF transition
|
569
|
+
* out). */
|
570
|
+
struct gzl_parse_stack_frame *frame = DYNARRAY_GET_TOP(s->parse_stack);
|
571
|
+
if(frame->frame_type == GZL_FRAME_TYPE_INTFA) {
|
572
|
+
struct gzl_intfa_frame *intfa_frame = &frame->f.intfa_frame;
|
573
|
+
if(intfa_frame->intfa_state->final &&
|
574
|
+
intfa_frame->intfa_state == &intfa_frame->intfa->states[0]) {
|
575
|
+
/* TODO: handle this case. */
|
576
|
+
assert(false);
|
577
|
+
} else if(intfa_frame->intfa_state->final) {
|
578
|
+
process_terminal(s, intfa_frame->intfa_state->final,
|
579
|
+
&frame->start_offset,
|
580
|
+
s->offset.byte - frame->start_offset.byte);
|
581
|
+
} else if(intfa_frame->intfa_state == &intfa_frame->intfa->states[0]) {
|
582
|
+
/* Pop the frame like it never happened. */
|
583
|
+
pop_intfa_frame(s);
|
584
|
+
} else {
|
585
|
+
/* IntFA is in neither a start nor a final state.
|
586
|
+
* This cannot be EOF. */
|
587
|
+
return false;
|
588
|
+
}
|
589
|
+
}
|
590
|
+
|
591
|
+
/* Next deal with an open GLA frame if there is one. The frame must be in
|
592
|
+
* a start state or have an outgoing EOF transition, else we are not at
|
593
|
+
* valid EOF. */
|
594
|
+
frame = DYNARRAY_GET_TOP(s->parse_stack);
|
595
|
+
if(frame->frame_type == GZL_FRAME_TYPE_GLA) {
|
596
|
+
struct gzl_gla_frame *gla_frame = &frame->f.gla_frame;
|
597
|
+
if(gla_frame->gla_state == &gla_frame->gla->states[0]) {
|
598
|
+
/* GLA is in a start state -- fine, we can just pop it as
|
599
|
+
* if it never happened. */
|
600
|
+
pop_gla_frame(s);
|
601
|
+
} else {
|
602
|
+
/* For this to still be valid EOF, this GLA state must have an
|
603
|
+
* outgoing EOF transition, and we must take it now. */
|
604
|
+
struct gzl_gla_transition *t =
|
605
|
+
find_gla_transition(gla_frame->gla_state, NULL);
|
606
|
+
if(!t) return false;
|
607
|
+
|
608
|
+
/* process_terminal() wants an IntFA frame to pop. */
|
609
|
+
push_empty_frame(s, GZL_FRAME_TYPE_INTFA, &s->offset);
|
610
|
+
process_terminal(s, NULL, &s->offset, 0);
|
611
|
+
|
612
|
+
/* Pop any GLA states that the previous may have pushed. */
|
613
|
+
while(s->parse_stack_len > 0 &&
|
614
|
+
DYNARRAY_GET_TOP(s->parse_stack)->frame_type !=
|
615
|
+
GZL_FRAME_TYPE_RTN)
|
616
|
+
pop_frame(s);
|
617
|
+
}
|
618
|
+
}
|
619
|
+
|
620
|
+
/* Now we should have only RTN frames open. Starting from the top, check
|
621
|
+
* that each frame's dest_state is a final state (or the actual current
|
622
|
+
* state in the bottommost frame). */
|
623
|
+
if(s->parse_stack_len > 0) { /* will be 0 if we already hit hard EOF. */
|
624
|
+
for(i = 0; i < s->parse_stack_len - 1; i++) {
|
625
|
+
frame = &s->parse_stack[i];
|
626
|
+
assert(frame->frame_type == GZL_FRAME_TYPE_RTN);
|
627
|
+
struct gzl_rtn_frame *rtn_frame = &frame->f.rtn_frame;
|
628
|
+
assert(rtn_frame->rtn_transition);
|
629
|
+
if(!rtn_frame->rtn_transition->dest_state->is_final) return false;
|
630
|
+
}
|
631
|
+
|
632
|
+
frame = DYNARRAY_GET_TOP(s->parse_stack);
|
633
|
+
struct gzl_rtn_frame *rtn_frame = &frame->f.rtn_frame;
|
634
|
+
if(!rtn_frame->rtn_state->is_final) return false;
|
635
|
+
|
636
|
+
/* We are truly in a state where EOF is ok. Pop remaining RTN frames to
|
637
|
+
* call callbacks appropriately. */
|
638
|
+
while(s->parse_stack_len > 0)
|
639
|
+
{
|
640
|
+
/* What should we do if the user cancels while the final RTN frames
|
641
|
+
* are being popped? It's kind of a weird thing to do. Options
|
642
|
+
* are to ignore it (we're finishing the parse anyway) or to stop.
|
643
|
+
* For now we ignore. */
|
644
|
+
pop_rtn_frame(s);
|
645
|
+
}
|
646
|
+
}
|
647
|
+
|
648
|
+
return true;
|
649
|
+
}
|
650
|
+
|
651
|
+
struct gzl_parse_state *gzl_alloc_parse_state()
|
652
|
+
{
|
653
|
+
struct gzl_parse_state *state = malloc(sizeof(*state));
|
654
|
+
INIT_DYNARRAY(state->parse_stack, 0, 16);
|
655
|
+
INIT_DYNARRAY(state->token_buffer, 0, 2);
|
656
|
+
return state;
|
657
|
+
}
|
658
|
+
|
659
|
+
struct gzl_parse_state *gzl_dup_parse_state(struct gzl_parse_state *orig)
|
660
|
+
{
|
661
|
+
struct gzl_parse_state *copy = malloc(sizeof(*copy));
|
662
|
+
/* This erroneously copies pointers to dynarrays, but we'll fix in a sec. */
|
663
|
+
*copy = *orig;
|
664
|
+
size_t i;
|
665
|
+
|
666
|
+
INIT_DYNARRAY(copy->parse_stack, 0, 16);
|
667
|
+
RESIZE_DYNARRAY(copy->parse_stack, orig->parse_stack_len);
|
668
|
+
for(i = 0; i < orig->parse_stack_len; i++)
|
669
|
+
copy->parse_stack[i] = orig->parse_stack[i];
|
670
|
+
|
671
|
+
INIT_DYNARRAY(copy->token_buffer, 0, 2);
|
672
|
+
RESIZE_DYNARRAY(copy->token_buffer, orig->token_buffer_len);
|
673
|
+
for(i = 0; i < orig->token_buffer_len; i++)
|
674
|
+
copy->token_buffer[i] = orig->token_buffer[i];
|
675
|
+
|
676
|
+
return copy;
|
677
|
+
}
|
678
|
+
|
679
|
+
void gzl_free_parse_state(struct gzl_parse_state *s)
|
680
|
+
{
|
681
|
+
FREE_DYNARRAY(s->parse_stack);
|
682
|
+
FREE_DYNARRAY(s->token_buffer);
|
683
|
+
free(s);
|
684
|
+
}
|
685
|
+
|
686
|
+
void gzl_init_parse_state(struct gzl_parse_state *s,
|
687
|
+
struct gzl_bound_grammar *bg)
|
688
|
+
{
|
689
|
+
s->offset.byte = 0;
|
690
|
+
s->offset.line = 1;
|
691
|
+
s->offset.column = 1;
|
692
|
+
s->open_terminal_offset = s->offset;
|
693
|
+
s->last_char_was_newline = false;
|
694
|
+
s->bound_grammar = bg;
|
695
|
+
RESIZE_DYNARRAY(s->parse_stack, 0);
|
696
|
+
RESIZE_DYNARRAY(s->token_buffer, 0);
|
697
|
+
|
698
|
+
/* Currently each stack frame takes 28 bytes on a 32-bit machine, so a
|
699
|
+
* stack depth of 500 is a modest 14kb of RAM. 500 frames of recursion is
|
700
|
+
* far deeper than we would expect any real text to be */
|
701
|
+
s->max_stack_depth = 500;
|
702
|
+
|
703
|
+
/* Currently each token of lookahead takes 20 bytes on a 32-bit machine, so
|
704
|
+
* a lookahead depth of 500 is 10kb of RAM. Input text would have to be
|
705
|
+
* truly pathological to require this much lookahead. */
|
706
|
+
s->max_lookahead = 500;
|
707
|
+
}
|
708
|
+
|
709
|
+
enum gzl_status gzl_parse_file(struct gzl_parse_state *state,
|
710
|
+
FILE *file, void *user_data,
|
711
|
+
size_t max_buffer_size)
|
712
|
+
{
|
713
|
+
struct gzl_buffer *buffer = malloc(sizeof(*buffer));
|
714
|
+
INIT_DYNARRAY(buffer->buf, 0, 4096);
|
715
|
+
buffer->buf_offset = 0;
|
716
|
+
buffer->bytes_parsed = 0;
|
717
|
+
buffer->user_data = user_data;
|
718
|
+
state->user_data = buffer;
|
719
|
+
|
720
|
+
/* The minimum amount of the data in the buffer that we want to be new data
|
721
|
+
* each time. This number shrinks as the amount of data we're preserving
|
722
|
+
* from open tokens grows. If the number is below this number we increase
|
723
|
+
* our buffer size. */
|
724
|
+
const size_t min_new_data = 4000;
|
725
|
+
|
726
|
+
enum gzl_status status;
|
727
|
+
bool is_eof = false;
|
728
|
+
do {
|
729
|
+
/* Make sure we have space for at least min_new_data new data. */
|
730
|
+
size_t new_buf_size = buffer->buf_size;
|
731
|
+
while(buffer->buf_len + min_new_data > new_buf_size)
|
732
|
+
buffer->buf_size *= 2;
|
733
|
+
if(new_buf_size > max_buffer_size) {
|
734
|
+
status = GZL_STATUS_RESOURCE_LIMIT_EXCEEDED;
|
735
|
+
break;
|
736
|
+
}
|
737
|
+
if(new_buf_size != buffer->buf_size) {
|
738
|
+
buffer->buf_size = new_buf_size;
|
739
|
+
buffer->buf = realloc(buffer->buf, new_buf_size);
|
740
|
+
}
|
741
|
+
size_t bytes_to_read = buffer->buf_size - buffer->buf_len;
|
742
|
+
|
743
|
+
/* Do the I/O and check for errors. */
|
744
|
+
size_t bytes_read = fread(buffer->buf + buffer->buf_len, 1,
|
745
|
+
bytes_to_read, file);
|
746
|
+
if(bytes_read < bytes_to_read) {
|
747
|
+
if(ferror(file)) {
|
748
|
+
status = GZL_STATUS_IO_ERROR;
|
749
|
+
break;
|
750
|
+
} else if(feof(file)) {
|
751
|
+
is_eof = true;
|
752
|
+
}
|
753
|
+
}
|
754
|
+
|
755
|
+
/* Do the parse. Start past whatever bytes we previously saved. */
|
756
|
+
char *parse_start = buffer->buf + buffer->buf_len;
|
757
|
+
buffer->buf_len += bytes_read;
|
758
|
+
status = gzl_parse(state, parse_start, bytes_read);
|
759
|
+
|
760
|
+
/* Preserve all data from tokens that haven't been returned yet:
|
761
|
+
*
|
762
|
+
* buf size len
|
763
|
+
* | | |
|
764
|
+
* v v v
|
765
|
+
* Buffer: -----------------------------------------------------------
|
766
|
+
* ^ ^ ^ ^
|
767
|
+
* | | | |
|
768
|
+
* buf_offset | | state->offset
|
769
|
+
* | |
|
770
|
+
* previous value of current value of
|
771
|
+
* state->open_terminal_offset state->open_terminal_offset
|
772
|
+
*
|
773
|
+
* |----| <-- Data we were previously saving.
|
774
|
+
*
|
775
|
+
* Data we should now be saving --> |------------|
|
776
|
+
*/
|
777
|
+
|
778
|
+
size_t bytes_to_discard = state->open_terminal_offset.byte -
|
779
|
+
buffer->buf_offset;
|
780
|
+
size_t bytes_to_save = buffer->buf_size - bytes_to_discard;
|
781
|
+
char *buf_to_save_from = buffer->buf + bytes_to_discard;
|
782
|
+
assert(bytes_to_discard <= (size_t) buffer->buf_len); /* hasn't overflowed. */
|
783
|
+
|
784
|
+
memmove(buffer->buf, buf_to_save_from, bytes_to_save);
|
785
|
+
buffer->buf_offset += bytes_to_discard;
|
786
|
+
buffer->buf_len = bytes_to_save;
|
787
|
+
} while(status == GZL_STATUS_OK && !is_eof);
|
788
|
+
|
789
|
+
if(status == GZL_STATUS_HARD_EOF || (status == GZL_STATUS_OK && is_eof)) {
|
790
|
+
if(gzl_finish_parse(state)) {
|
791
|
+
if(!feof(file) || buffer->buf_len > 0) {
|
792
|
+
/* There was data left over -- we hit grammar EOF before
|
793
|
+
* file EOF. */
|
794
|
+
status = GZL_STATUS_OK;
|
795
|
+
} else
|
796
|
+
status = GZL_STATUS_PREMATURE_EOF_ERROR;
|
797
|
+
} else
|
798
|
+
status = GZL_STATUS_PREMATURE_EOF_ERROR;
|
799
|
+
}
|
800
|
+
|
801
|
+
FREE_DYNARRAY(buffer->buf);
|
802
|
+
free(buffer);
|
803
|
+
return status;
|
804
|
+
}
|
805
|
+
|
806
|
+
/*
|
807
|
+
* Local Variables:
|
808
|
+
* c-file-style: "bsd"
|
809
|
+
* c-basic-offset: 4
|
810
|
+
* indent-tabs-mode: nil
|
811
|
+
* End:
|
812
|
+
* vim:et:sts=4:sw=4
|
813
|
+
*/
|