gullah 0.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,15 @@
1
# frozen_string_literal: true

module Gullah
  # A placeholder node covering a character sequence no leaf rule matched.
  class Trash < Node # :nodoc:
    # Trash nodes always identify themselves as trash.
    def trash?
      true
    end

    # Trash also acts as a parsing boundary: no parse may span it.
    def boundary?
      true
    end
  end
end
@@ -0,0 +1,7 @@
1
# frozen_string_literal: true

module Gullah
  # Gem version. This is an alpha release: Gullah has not yet been used in
  # anything other than unit tests.
  VERSION = '0.0.0'
end
data/lib/gullah.rb ADDED
@@ -0,0 +1,777 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'set'
4
+ %w[version atom error hopper leaf node trash boundary parse rule iterator dotifier segment picker].each do |s|
5
+ require "gullah/#{s}"
6
+ end
7
+
8
+ # A collection of class methods that can be added into a class to make it a parser.
9
+ # For example:
10
+ #
11
+ # class Foo
12
+ # extend Gullah
13
+ #
14
+ # rule :plugh, 'foo bar+ | bar foo{1,3}'
15
+ # rule :foo, 'number word'
16
+ # rule :bar, 'punctuation "wow!"'
17
+ # leaf :word, /[a-z]+/i
18
+ # leaf :number, /\d+(?:\.\d+)?/
19
+ # leaf :punctuation, /[^\w\s]+/
20
+ # end
21
+ #
22
+ # Having defined a grammar like this, one can apply it to arbitrary strings to
23
+ # generate parse trees:
24
+ #
25
+ # Foo.parse "123 cat @#$ wow! ___wow!"
26
+ #
27
+ # Gullah can produce parse trees from incomplete or ambiguous grammars. It can handle
28
+ # noisy data. One can apply arbitrary tests to parse nodes, including tests that
29
+ # depend on other nodes in the parse tree. In the case of test failure the nature
30
+ # of the failure is marked on the corresponding nodes in the parse tree.
31
+ #
32
+ # = Syntax
33
+ #
34
+ # This section describes only the syntax of Gullah rules, not the entire API. Gullah
35
+ # syntax is generally the more familiar subset of the rules of regular expressions.
36
+ #
37
+ # - <b>sequence</b>
38
+ #
39
+ # rule :foo, 'bar baz' # one thing follows another
40
+ #
41
+ # - <b>alternation</b>
42
+ #
43
+ # rule :foo, 'bar | baz' # separate alternates with pipes
44
+ # rule :foo, 'plugh+' # or simply define it additional times (not regex grammar)
45
+ #
46
+ # Note, you can define all alternates by simple redefinition as in the second line
47
+ # above. You can use the pipe syntax for convenience. Any tests or preconditions
48
+ # provided with a particular definition of the rule <em>will apply only for that definition</em>.
49
+ #
50
+ # - <b>repetition</b>
51
+ #
52
+ # rule :option, 'foo?' # ? means "one or none"
53
+ # rule :plural, 'foo+' # + means "one or more"
54
+ # rule :options, 'foo*' # * means "zero or more"
55
+ # rule :n, 'foo{2}' # {n} means "exactly n"
56
+ # rule :n_plus, 'foo{2,}' # {n,} means "n or more"
57
+ # rule :n_m, 'foo{2,3}' # {n,m} means "between n and m"
58
+ #
59
+ # Note, though you can define rules like +option+ and +options+, a rule can't add
60
+ # a node to the parse tree if it matches nothing. These repetition suffixes are
61
+ # more useful as part of a sequence. In practice <tt>foo?</tt> will be a less
62
+ # efficient version of <tt>foo</tt>, and <tt>foo*</tt>, a less efficient version of
63
+ # <tt>foo+</tt>.
64
+ #
65
+ # - <b>literals</b>
66
+ #
67
+ # rule :foo, '"(" bar ")"'
68
+ #
69
+ # Literals allow you to avoid defining simple leaf rules. The above is basically
70
+ # shorthand for
71
+ #
72
+ # rule :foo, 'left_paren bar right_paren'
73
+ # leaf :left_paren, /\(/
74
+ # leaf :right_paren, /\)/
75
+ #
76
+ # You may use either single or double quotes to define literals. You may also use
77
+ # escape sequences to include random characters in literals. Literals may have
78
+ # repetition suffixes.
79
+ #
80
+ # - <b>grouping</b>
81
+ #
82
+ # rule :foo, 'bar baz'
83
+ #
84
+ # Surprise! There is no grouping syntax in Gullah. Every rule is in effect a named group.
85
+ # So it might be better said that there are no anonymous groups in Gullah and grouping
86
+ # doesn't involve parentheses.
87
+ #
88
+ # You may be wondering about whitespace handling. See +ignore+ and +keep_whitespace+ below.
89
+ # The short version of it is that Gullah creates an ignorable whitespace leaf rule by
90
+ # default.
91
+ #
92
+ # = Preconditions
93
+ #
94
+ # The first step in adding a node to a parse tree is collecting a sequence of child
95
+ # nodes that match some rule. If the rule is
96
+ #
97
+ # rule :foo, 'bar+'
98
+ #
99
+ # you've collected a sequence of +bar+ nodes. If there is some condition you need this
100
+ # node to respect <em>which is dependent only on the rule and the child nodes</em> which you
101
+ # can't express, or not easily, in the rule itself, you can define one or more
102
+ # preconditions. E.g.,
103
+ #
104
+ # rule :foo, 'bar+', preconditions: %i[fibonacci]
105
+ #
106
+ # def fibonacci(_name, _start, _end, _text, children)
107
+ # is_fibonacci_number? children.length # assumes we've defined is_fibonacci_number?
108
+ # end
109
+ #
110
+ # A precondition is just an instance method defined in the Gullah-fied class with an arity
111
+ # of five: it takes the rule's name, a symbol, as its first argument, the start and end character
112
+ # offsets of the match as its second and third arguments, the text being parsed as its fourth argument,
113
+ # and the prospective child nodes, an array, as its last argument. If it returns a truthy value, the
114
+ # precondition holds and the node can be made. Otherwise, Gullah tries the next thing.
115
+ #
116
+ # == Preconditions versus Tests
117
+ #
118
+ # Preconditions are like tests (see below). They are further conditions on the building of
119
+ # nodes in a parse tree. Why does Gullah provide both? There are several reasons:
120
+ #
121
+ # - Preconditions are tested before the node is built, avoiding the overhead of cloning
122
+ # nodes, so they are considerably lighter weight.
123
+ # - Because they are tested <em>before</em> the node is built, they result in no partially erroneous
124
+ # parse in the event of failure, so they leave nothing Gullah will attempt to improve further
125
+ # at the cost of time.
126
+ # - But they don't leave a trace, so there's nothing to examine in the event of failure.
127
+ # - And they concern only the subtree rooted at the prospective node, so they cannot express
128
+ # structural relationships between this node and nodes which do not descend from it.
129
+ #
130
+ # <b>Note</b>: preconditions cannot test relationships between <em>nodes</em> outside the prospective node's
131
+ # subtree, but they can test its relationships to adjoining <em>characters</em>, so they can
132
+ # implement lookarounds. For instance:
133
+ #
134
+ # def colon_after(_rule_or_leaf_name, _start_offset, end_offset, text, _children)
135
+ # text[end_offset..-1] =~ /\A\s*:/ # equivalent to (?=\s*:)
136
+ # end
137
+ #
138
+ # = Tests
139
+ #
140
+ # rule :all, 'baz+', tests: %i[half_even]
141
+ # rule :baz, 'foo | bar'
142
+ #
143
+ # leaf :foo, /\d+/, tests: %i[xenophilia]
144
+ # leaf :bar, /[a-z]/i, tests: %i[xenophilia]
145
+ #
146
+ # # node test!
147
+ #
148
+ # # half the digit characters under this node must be even, half, odd
149
+ # def half_even(node)
150
+ # even, odd = node.text.chars.select { |c| c =~ /\d/ }.partition { |c| c.to_i.even? }
151
+ # even.length == odd.length ? :pass : :fail
152
+ # end
153
+ #
154
+ # # structure test!
155
+ #
156
+ # # foos need bars and bars need foos
157
+ # def xenophilia(root, node)
158
+ # if root.name == :all
159
+ # sought = node.name == :foo ? :bar : :foo
160
+ # root.descendants.any? { |n| n.name == sought } ? :pass : :fail
161
+ # end
162
+ # end
163
+ #
164
+ # A special feature of Gullah is that you can add arbitrary tests to its rules. For example
165
+ # you can use a simple regular expression to match a date and then a test to do a sanity
166
+ # check to confirm that the parts of the date, the year, month, and day, combine to produce
167
+ # a real date on the calendar. This is better than simply writing a thorough regular expression
168
+ # because it gives you the opportunity to tell the user *how* a match failed rather than simply
169
+ # that it failed. This feature is Gullah's answer to such things as lookarounds and back
170
+ # references: you've matched a simple pattern; now does this pattern fit sanely with its context?
171
+ #
172
+ # There are two sorts of tests: node tests and structure tests. Node tests are tests that need
173
+ # only the node itself and its subtree as inputs. Structure tests are tests that depend on
174
+ # elements of the parse tree outside of the subtree rooted at the node itself.
175
+ #
176
+ # Tests are implemented as instance methods of the Gullah-fied class. If the method has an arity
177
+ # of one, it is a node test. Its single argument is the node matched. If it has an arity of two,
178
+ # it is a structure test. The first argument is an ancestor node of the node corresponding to the
179
+ # rule. The second argument is the node itself. Because structure tests cannot be run until the
180
+ # node has some ancestor, and then they might not apply to all ancestors, they can be in a "pending"
181
+ # state, where the test is queued to run but has not yet run.
182
+ #
183
+ # Tests must return one of four values: +:pass+, +:fail+, +:ignore+, or +nil+. Only structure
184
+ # tests may return +nil+, which indicates that the preconditions for the test have not yet been
185
+ # met. If a structure test returns +nil+, the test remains in a pending state and it will be run
186
+ # again when the node acquires a new ancestor.
187
+ #
188
+ # If a node test passes, the node is accepted into the parse tree. If it fails, the node is marked
189
+ # as erroneous and the particular cause of its failure is marked in the abstract syntax tree. If
190
+ # this tree is returned to the user, they will see this information. In addition to +:fail+, the rule
191
+ # may return more specific explanatory information:
192
+ #
193
+ # rule :word, /\w+/, tests: %i[we_want_foo!]
194
+ #
195
+ # def we_want_foo!(n)
196
+ # if n.text =~ /foo/
197
+ # :pass
198
+ # else
199
+ # [:fail, %Q[we really wanted to see "foo" but all we got was #{n.text.inspect}]]
200
+ # end
201
+ # end
202
+ #
203
+ # If a node returns +:pass+, the fact that the node passed the rule in question will be added to
204
+ # its +attributes+ hash in the AST.
205
+ #
206
+ # If a rule returns +:ignore+, this will constitute a pass, but no edits will be made to the AST.
207
+ #
208
+ # Tests short-circuit! If a node has many tests, they run until one fails.
209
+ #
210
+ # == Disadvantages of Tests
211
+ #
212
+ # All this being said, when tests <em>fail</em> they do so after their node has been built and added
213
+ # to a parse. This means their partially broken parse remains a candidate as Gullah tries to
214
+ # find the least bad way to parse the text it was given. This can be computationally expensive.
215
+ # If you can make do with preconditions (see above), they are the better choice.
216
+ #
217
+ # = Processors
218
+ #
219
+ # rule :word, /[a-z]+/i, process: :abbrv
220
+ # leaf :integer, /[1-9]\d*/, process: ->(n) { n.atts[:val] = n.text.to_i }
221
+ #
222
+ # def abbrv(node)
223
+ # node.attributes[:abbreviation] = node.text.gsub(/(?<!^)[aeiou]/, '')[0...5]
224
+ # end
225
+ #
226
+ # Any rule may have a +process+ named argument whose value is either a proc or a symbol.
227
+ # If it is a symbol, it must be the name of an instance method of the Gullah-fied class.
228
+ # In either case, the arity of the code in question must be one: its single argument will
229
+ # be the node created by the rule.
230
+ #
231
+ # The processing code may do anything -- log the event, provide a breakpoint -- but its
232
+ # expected use is to calculate and store some attribute of the node or its subtree in the
233
+ # node's attribute hash, most likely to accelerate other tests that will depend on this
234
+ # value. You may use this mechanism for other purposes, of course, to compile the text
235
+ # parsed into a more useful object, say, but because processing may occur on nodes which
236
+ # are later discarded in failed parses, it may be more efficient to defer such handling
237
+ # of the AST until the parse completes.
238
+ #
239
+ # Processors run after any tests have completed and only if they all pass.
240
+ #
241
+ # = Motivation
242
+ #
243
+ # Why does Gullah exist? Well, mostly because it seemed like fun to make it. I have made
244
+ # other grammar-adjacent things -- a recursive descent parser in Java inspired by the grammars
245
+ # of Raku, various regular expression optimization libraries in various languages, a simple
246
+ # grammar-esque regular expression enhancer for Rust that produces abstract syntax trees but
247
+ # can't handle recursion -- so I was thinking about the topic. A problem I faced with the recursive
248
+ # descent parser, which I later learned was a well-known problem, was infinite left-recursion.
249
+ # If you have a rule such as <tt>X -> X Y | Z</tt>, where an +X+ can be made of other +X+ es, your recursive
250
+ # descent parser constructs an infinitely long plan that never touches the data -- "I'll try an X, which
251
+ # means I'll first try an X, which means I'll first try an X..." The solution to this is to create an
252
+ # arbitrary, perhaps adjustable, recursion limit, recognize this pattern of recursion, and bail out
253
+ # when you find you've planned too long without executing anything. This is how I solved the problem in
254
+ # the library I wrote, but I found this unsatisfactory.
255
+ #
256
+ # An alternative solution, it occurred to me, was to start with the data rather than the plan. "I have
257
+ # an +X+. What can I make with this?" This instantly solves the left recursion problem, because the application
258
+ # of a rule must consume nodes, and it seems like a more
259
+ # reasonable way to parse things generally. As a latent linguist, this appealed to me as more psychologically
260
+ # realistic. Certainly people understand words in part by approaching language with expectations -- the top-down
261
+ # pattern you see in recursive descent -- but people are constantly confronted with text begun in the middle or
262
+ # interrupted or repaired mid-sentence, so they must be able as well to take the words they hear and try to
263
+ # make something from them. So I wanted to make a data-driven, bottom-up parser.
264
+ #
265
+ # (One thing I should say up front is that the design of Gullah is based entirely on my own pondering. I am not
266
+ # a very enthusiastic reader of other people's research. I am aware that a lot of work has been done on
267
+ # parsing and parser design, but the fun for me is in coming up with the ideas more than in doing the background
268
+ # reading, so I have just dived in. I am sure I have reinvented some wheels in this, most likely badly.)
269
+ #
270
+ # (Another aside: The left-recursion problem disappears with a bottom-up parser, which must consume data to proceed, but it
271
+ # is replaced with a unary-branching problem. If you have a rule that says an +A+ can be relabeled +B+ -- that
272
+ # is, you can add a node with a single child -- you risk an infinite loop. You may define rules such that +A+ becomes
273
+ # +B+, and another rule, or series of rules, which turns this +B+ back into an +A+. So this bottom-up parser has
274
+ # a somewhat unsatisfactory loop check as well.)
275
+ #
276
+ # A side benefit of bottom-up parsing is that it is robust against ill-formed data. If you can't make what you
277
+ # set out to make at least you can make something. And the structure you build out of the data can show very
278
+ # clearly where it has gone wrong. As a linguist, this appealed to my desire to model natural languages with
279
+ # all their noise and redundancy. As a programmer, this appealed to me as a way to make data problems
280
+ # transparent and solvable.
281
+ #
282
+ # = Efficiency
283
+ #
284
+ # I have taken care to make rules fail fast and have followed a dynamic programming model in which I cache
285
+ # information which would otherwise be recalculated in many recursions, but Gullah is certainly not as
286
+ # efficient as a parser custom designed for a particular language. A SAX parser of XML, for example, can
287
+ # process its input in linear time by pushing half-processed constructs onto a stack. The general mechanism
288
+ # underlying Gullah is worst-case quadratic, because events already seen may have to be scanned again to
289
+ # see whether recent decisions have changed whether they can be handled. If every node added to a
290
+ # provisional parse tree reduces the unprocessed node count by one and every scan on average finishes
291
+ # halfway through the unhandled nodes, this would mean n(n - 1)/2 comparisons to complete the tree. I doubt,
292
+ # though I cannot prove, that one could improve on this while maintaining one's parser's ability to handle
293
+ # broken data or ambiguous grammars. Ranking rules to try next based on past experience in the tree
294
+ # might improve the speed of parse discovery, but at the cost of greater complexity in the handling of any
295
+ # single scan.
296
+ #
297
+ # So if you have a particular data format or language you want to handle efficiently and you expect in most
298
+ # cases you will succeed without ambiguity on a single pass, Gullah is not the tool you want. But if you
299
+ # want to recover gracefully, it may be that a second pass with Gullah to produce the least bad parse and
300
+ # some information about how things went wrong is useful.
301
+
302
+ module Gullah
303
+ ##
304
+ # Define a tree structure rule. This specifies how tree nodes may be grouped under
305
+ # another node. The required arguments are +name+ and +body+. The former is a label
306
+ # for the node under which the others are grouped. The latter is a string defining
307
+ # the rule.
308
+ #
309
+ # rule :sequence, 'this then this'
310
+ #
311
+ # rule :quantifiers, 'foo bar? baz* plugh+ qux{2} quux{3,} corge{4,5}'
312
+ #
313
+ # rule :alternates, 'this | or | that'
314
+ # # you may also add alternates like so
315
+ # rule :alternates, 'also | these | and | those'
316
+ # rule :alternates, 'etc'
317
+ #
318
+ # rule :literals, %['this' "that"]
319
+ #
320
+ # rule :escapes, 'foo\\? "bar\\""'
321
+ #
322
+ # # the optional named arguments:
323
+ #
324
+ # rule :process, 'aha', process: ->(n) { log "Aha! we just matched #{n.text}!" }
325
+ # rule :or_maybe, 'oho', process: :some_arity_one_method_in_class_extending_gullah
326
+ #
327
+ # rule :tests, 'test me', tests: %i[node structure]
328
def rule(name, body, tests: [], preconditions: [], process: nil)
  raise Error, 'tests must be an array' unless tests.is_a? Array
  raise Error, 'preconditions must be an array' unless preconditions.is_a? Array

  init
  init_check(name)
  name = name.to_sym
  # normalize whitespace so equivalent bodies compare equal
  body = body.to_s.strip.gsub(/\s+/, ' ')
  # a perfect duplicate of an existing rule is silently ignored
  return if dup_check(:rule, name, body, tests + preconditions)

  # a processor rides along as a pseudo-test
  tests << [process] if process
  new_rule = Rule.new name, body, tests: tests, preconditions: preconditions
  (new_rule.subrules || [new_rule]).each do |subrule|
    @rules << subrule
    # index each subrule by the names that can begin it
    subrule.starters.each do |starter_name, atom|
      (@starters[starter_name] ||= []) << atom
    end
  end
  # auto-generate leaf rules for any quoted literals in the body
  new_rule.literals.each do |sym|
    leaf sym.to_s, Regexp.new(quotemeta(sym.to_s))
  end
end
351
+
352
+ ##
353
+ # Don't make whitespace automatically ignorable.
354
+ #
355
+ # class Foo
356
+ # extend Gullah
357
+ #
358
+ # keep_whitespace
359
+ #
360
+ # rule :a, 'a+'
361
+ # leaf :a, /a/
362
+ # end
363
+ #
364
+ # Foo.parse "aaa aaa"
365
+ #
366
+ # In this example, the parse tree would consist of two a nodes, each parent to three 'a' leaves,
367
+ # separated by a "trash" node corresponding to the whitespace, for which no leaf rule was provided.
368
def keep_whitespace
  # flag consulted (and removed) by commit; suppresses the automatic
  # ignorable-whitespace leaf rule
  @keep_whitespace = true
end
371
+
372
+ ##
373
+ # A tokenization rule to divide the raw text into tokens to be matched by rules.
374
+ #
375
+ # The required arguments are a name and a regular expression. The name is what other
376
+ # rules will refer to. The regular expression of course defines the character sequence
377
+ # the rule matches. The more precise the regular expression the fewer false possibilities
378
+ # Gullah will have to sort through to find the best parse(s). Boundary markers in
379
+ # particular, +\b+ or lookarounds such as <tt>(?<!\d)</tt>, are helpful in this regard.
380
+ #
381
+ # The optional arguments are +tests+ and +process+. See +rule+ for more regarding these.
382
+ #
383
+ # leaf :word, /\b\w+\b/
384
+ # leaf :integer, /(?<!\d)[1-9]\d*(?!=\d)/, process: ->(n) { n.atts[:val] = n.text.to_i }
385
+ # leaf :name, /Bob/, tests: [:not_bobbing]
386
+ #
387
+ # def not_bobbing(n)
388
+ # /bing/.match(n.full_text, n.end) ? :fail : :pass
389
+ # end
390
def leaf(name, rx, tests: [], preconditions: [], process: nil)
  # an ordinary leaf: visible to rules and able to serve as a child node
  _leaf name, rx, ignorable: false, preconditions: preconditions, tests: tests, process: process
end
393
+
394
+ ##
395
+ # A tokenization rule like +leaf+, but whose tokens are invisible to other rules.
396
+ # The +ignore+ method is otherwise identical to +leaf+.
397
+ #
398
+ # Unless +keep_whitespace+ is called, an +ignore+ rule covering whitespace will be
399
+ # generated automatically. Its name will be "_ws", or, if that is taken, "_wsN", where
400
+ # N is an integer sufficient to make this name unique among the rules of the grammar.
401
def ignore(name, rx, tests: [], preconditions: [], process: nil)
  # BUG FIX: the caller's preconditions were formerly discarded (always
  # passed through as []); forward them just as +leaf+ and +boundary+ do
  _leaf name, rx, ignorable: true, tests: tests, process: process, preconditions: preconditions
end
404
+
405
+ ##
406
+ # A tokenization rule like +leaf+, but whose tokens cannot be the children of other nodes.
407
+ # The +boundary+ method is otherwise identical to +leaf+.
408
+ #
409
+ # Boundaries are extremely valuable for reducing the complexity of parsing, because Gullah
410
+ # knows no parse can span a boundary. Trash nodes -- nodes that correspond to character
411
+ # sequences unmatched by any leaf rule -- are also boundaries, though most likely erroneous
412
+ # ones.
413
+ #
414
+ # # clause boundary pattern
415
+ # boundary :terminal, /[.!?](?=\s*\z|\s+"?\p{Lu})|[:;]/
416
def boundary(name, rx, tests: [], preconditions: [], process: nil)
  # boundary leaves cannot be adopted by other nodes, which lets the parser
  # treat the text on either side as independent segments
  _leaf name, rx, boundary: true, process: process, tests: tests, preconditions: preconditions
end
419
+
420
+ ##
421
+ # Obtain the set of optimal parses of the given text. Optimality is determined
422
+ # by four criteria. In every case the smaller the number the better.
423
+ #
424
+ # correctness:: The count of node or structure tests that have failed.
425
+ # completion:: The count of root nodes.
426
+ # pending:: The count of structure tests that were not applied.
427
+ # size:: The total number of nodes.
428
+ #
429
+ # You can adjust the optimality conditions only by removing them via the optional
430
+ # +filters+ argument. If you supply this argument, only the optimality criteria you
431
+ # specify will be applied. The order of application is fixed: if parse A is more
432
+ # correct than parse B, it will be kept and B discarded even if B is more complete,
433
+ # has fewer pending tests, and fewer nodes.
434
+ #
435
+ # The optional +n+ parameter can be used to specify the desired number of parses.
436
+ # This is useful if your parse rules are ambiguous. For example, consider the grammar
437
+ #
438
+ # class Binary
439
+ # extend Gullah
440
+ # rule :a, 'a{2}'
441
+ # leaf :a, /\S+/
442
+ # end
443
+ #
444
+ # If you ask this to parse the string "a b c d e f g h i j k l" it will produce
445
+ # 58,786 equally good parses. These will consume a lot of memory and producing them
446
+ # will consume a lot of time. The +n+ parameter will let you get on with things faster.
447
+ #
448
+ # A caveat: Because of the way Gullah works you may not get exactly +n+ parses
449
+ # back when you ask for +n+. There may not be sufficiently many parses, of course, but
450
+ # you may also get back more than +n+ parses if the text you are parsing contains
451
+ # parsing boundaries. Gullah parses the portions of text inside the boundaries separately,
452
+ # so the number of possible parses will be the product of the number of parses of
453
+ # each bounded segment. If you have a sentence boundary in the middle of your text,
454
+ # and thus two segments, the number of parses of the entire text will be the number
455
+ # of parses of the first segment times the number of parses of the second. If the first
456
+ # has two parses and the second also has two but you ask for 3, the number of parses
457
+ # Gullah will find as it goes will be 1, then 2, then 4. There is no iteration of the
458
+ # process in which Gullah has found exactly 3 parses. The 4 it has found are necessarily
459
+ # all equally good, so rather than arbitrarily choosing 3 and discarding one, Gullah
460
+ # will return all 4.
461
def parse(text, filters: %i[correctness completion pending size], n: nil)
  raise Error, 'n must be positive' if n&.zero?

  commit
  segments = segment(text.to_s, filters, n)
  # the segments that begin at offset zero hold the results we return
  first_segments = segments.select { |seg| seg.start.zero? }
  if n
    # iterate until every segment is done or we have gathered >= n parses;
    # another place to start parallelization
    loop do
      lightest = segments.reject(&:done).min_by(&:weight)
      break unless lightest
      break if lightest.next && first_segments.sum(&:total_parses) >= n
    end
  else
    # exhaust every segment; NOTE: could be parallelized
    until (unfinished = segments.reject(&:done)).empty?
      unfinished.first.next
    end
  end
  if segments.length == 1
    segments.first.results
  else
    # pass the combined results through a fresh hopper to filter out duds
    sieve = Hopper.new filters, nil
    first_segments.flat_map(&:results).each { |p| sieve << p }
    sieve.dump.each(&:initialize_summaries)
  end
end
489
+
490
+ ##
491
+ # The first parse found. This takes the same arguments as +parse+ minus +n+.
492
+ # If there are no parses without errors or unsatisfied pending tree structure
493
+ # tests, it will be the first erroneous or incomplete parse.
494
+ #
495
+ # If you expect the parse to succeed and be unambiguous, this is the method you
496
+ # want.
497
def first(text, filters: %i[correctness completion pending size])
  # delegate to parse, capped at a single parse, and unwrap it
  parse(text, n: 1, filters: filters).first
end
500
+
501
+ # :stopdoc:
502
+
503
+ private
504
+
505
def init
  # idempotent: bail out if the grammar's state already exists
  return if iv_check :@rules

  @committed = false
  @do_unary_branch_check = nil
  @rules = []
  @leaves = []
  @starters = {}
  @tests = {}
  @preconditions = {}
end
516
+
517
def iv_check(var)
  # the instance variable's value, or false when it is not defined
  value = instance_variable_defined?(var) && instance_variable_get(var)
  if value && block_given?
    yield value
  else
    value
  end
end
521
+
522
+ # do some sanity checking, initialization, and optimization
523
def commit
  return if iv_check(:@committed)
  raise Error, "#{name} has no leaves" unless iv_check(:@leaves, &:any?)

  # add the whitespace rule unless told otherwise
  if iv_check(:@keep_whitespace)
    remove_instance_variable :@keep_whitespace
  else
    # pick a name -- _ws, _ws1, _ws2, ... -- not already taken by a rule or leaf
    used_rules = (@rules.map(&:name) + @leaves.map(&:name)).to_set
    base = '_ws'
    count = nil
    count = count.to_i + 1 while used_rules.include? "#{base}#{count}".to_sym
    _leaf "#{base}#{count}".to_sym, /\s+/, ignorable: true
  end

  # vet on commit so rule definition is order-independent
  [@leaves, @rules].flatten.each do |r|
    vetted_tests = r.tests.map { |t| vet t }
    vetted_preconds = r.preconditions.map { |pc| vet_precondition pc }
    r._post_init(vetted_tests, vetted_preconds)
  end
  completeness_check
  loop_check
  # arrange things so we first try rules that can complete more of the parse;
  # better would be sorting by frequency in parse trees, but we don't have
  # that information
  # BUG FIX: transform_values returns a new hash, so the sorted ordering was
  # previously computed and discarded; transform_values! applies it in place
  @starters.transform_values! { |atoms| atoms.sort_by(&:max_consumption).reverse }
  remove_instance_variable :@leaf_dup_check if iv_check(:@leaf_dup_check)
  remove_instance_variable :@rule_dup_check if iv_check(:@rule_dup_check)
  @committed = true
end
554
+
555
+ # has every rule/leaf required by some rule been defined?
556
def completeness_check
  # every name a rule seeks must belong to some defined rule or leaf
  defined_names = (@rules + @leaves).map(&:name).to_set
  wanted = @rules.flat_map(&:seeking).uniq.to_set
  missing = wanted.reject { |sym| defined_names.include? sym }
  raise Error, "the following rules or leaves remain undefined: #{missing.join(', ')}" if missing.any?
end
562
+
563
+ # define the @do_unary_branch_check variable
564
def loop_check
  # assume no unary-branch loop until one is demonstrated
  @do_unary_branch_check = false
  links = @rules.select(&:potentially_unary?).flat_map(&:branches).uniq
  return if links.empty?

  chains = links.map { |l| LoopCheck.new l }
  # extend every chain by every link until no chain can grow;
  # LoopCheck#add throws :looped (and flags us) when a cycle closes
  catch :looped do
    until chains.empty?
      grown = []
      links.each do |link|
        chains.each do |chain|
          extended = chain.add(link, self)
          grown << extended if extended
        end
      end
      chains = grown
    end
  end
end
584
+
585
# tracks one chain of unary rewrites, looking for a cycle
class LoopCheck
  def initialize(link)
    @seen = Set.new(link)
    @seeking = link.last
  end

  # Attempt to extend this chain with +link+. Returns a new, longer
  # LoopCheck when the link continues the chain, nil when it does not,
  # and throws :looped -- after flagging the grammar -- on a cycle.
  def add(link, grammar)
    return unless @seeking == link.first

    if @seen.include? link.last
      grammar.instance_variable_set :@do_unary_branch_check, true
      throw :looped
    end
    LoopCheck.new(@seen.to_a + [link.last])
  end
end
601
+
602
def init_check(name)
  # the rule set is frozen once parsing has begun
  return unless @committed

  raise Error, "cannot define #{name}; all rules must be defined before parsing"
end
605
+
606
+ # a tokenization rule to divide the raw text into tokens and separators ("ignorable" tokens)
607
def _leaf(name, rx, ignorable: false, boundary: false, tests: [], preconditions: [], process: nil)
  raise Error, 'tests must be an array' unless tests.is_a? Array
  raise Error, 'preconditions must be an array' unless preconditions.is_a? Array

  init
  init_check(name)
  name = name.to_sym
  # a perfect duplicate of an existing leaf is silently ignored
  return if dup_check(:leaf, name, rx, tests + preconditions)

  # a processor rides along as a pseudo-test
  tests << [process] if process
  new_leaf = Leaf.new(name, rx,
                      ignorable: ignorable, boundary: boundary,
                      tests: tests, preconditions: preconditions)
  @leaves << new_leaf
end
619
+
620
+ # convert raw text into one or more arrays of leaf nodes -- maximally unreduced parses
621
def lex(text)
  # frontier of [offset, partial-parse] pairs still being extended
  frontier = [[0, Parse.new(text)]]
  complete = []
  until frontier.empty?
    offset, parse = frontier.shift
    matched_here = false
    @leaves.each do |leaf|
      # can this leaf rule extract a leaf exactly at this offset?
      md = leaf.rx.match(text, offset)
      next unless md && md.begin(0) == offset

      finish = md.end(0)
      next if leaf.preconditions.any? { |pc| pc.call(leaf.name, offset, finish, text, []) == :fail }

      matched_here = true
      extended = parse.add(offset, finish, leaf, @do_unary_branch_check, false, leaf.boundary)
      if finish == text.length
        complete << extended
      else
        frontier << [finish, extended]
      end
    end
    next if matched_here

    # nothing matched at this offset, so swallow characters as trash up to
    # the nearest offset where some leaf rule can match
    trash_end = text.length
    @leaves.each do |leaf|
      md = leaf.rx.match(text, offset)
      next unless md

      start = md.begin(0)
      next unless start < trash_end

      finish = md.end(0)
      next unless leaf.preconditions.none? { |pc| pc.call(leaf.name, start, finish, text, []) == :fail }

      trash_end = start
    end
    trashed = parse.add(offset, trash_end, trash_rule, false, true)
    if trash_end == text.length
      complete << trashed
    else
      frontier << [trash_end, trashed]
    end
  end
  complete # an array of Parses
end
666
+
667
+ # slice text into independent segments
668
def segment(text, filters, n)
  # bundle together the raw lexed segments covering the same character span
  raw = lex(text).flat_map(&:split)
  segments = raw.group_by { |seg| [seg.start, seg.end] }.values.map do |group|
    Segment.new group, filters, @starters, @do_unary_branch_check, n
  end
  # link each segment to the segments that pick up where it leaves off
  segments.group_by(&:end).each do |stop, group|
    followers = segments.select { |seg| seg.start == stop }
    group.each { |seg| seg.continuations = followers }
  end
  segments
end
679
+
680
def trash_rule
  # a lazily built leaf with a blank name and no pattern, used to label
  # character spans no real leaf rule matched
  @trash_rule ||= Leaf.new(:"", nil)
end
683
+
684
def singleton
  # a lazily built instance of the extended class, used to look up the
  # methods named as tests, preconditions, and processors
  @singleton ||= new
end
687
+
688
+ # check for duplicate rule/leaf
689
+ # return true if perfect duplicate, false if novel
690
def dup_check(type, name, body, tests)
  # leaves and rules are tracked in separate registries
  seen =
    if type == :leaf
      @leaf_dup_check ||= ::Set.new
    else
      @rule_dup_check ||= ::Set.new
    end
  signature = [name, body, tests.sort]
  return true if seen.include?(signature)

  seen << signature
  false
end
700
+
701
+ # vet tests
702
def vet(test)
  # an array marks a processing function rather than a real test
  return procify(test.first) if test.is_a? Array

  # resolve and cache the named test method, verifying its signature
  @tests[test] ||= begin
    begin
      method = singleton.method(test)
    rescue ::NameError
      raise Error, "#{test} is not defined"
    end
    unless (1..2).include? method.arity
      raise Error, "#{test} must take either one or two arguments"
    end

    method
  end
end
719
+
720
+ # vet preconditions
721
def vet_precondition(precond)
  # resolve and cache the named precondition method, verifying its signature
  @preconditions[precond] ||= begin
    begin
      m = singleton.method(precond)
    rescue ::NameError
      raise Error, "#{precond} is not defined"
    end
    # BUG FIX: the message formerly said "four arguments" even though five
    # are required by the arity check (and five were listed)
    raise Error, <<-MESSAGE.strip.gsub(/\s+/, ' ') unless m.arity == 5
      #{precond} must take five arguments:
      the rule or leaf name,
      the start character offset,
      the end character offset,
      the text being parsed,
      and the prospective children
    MESSAGE

    m
  end
end
740
+
741
+ # escape a string literal for use in a regex
742
def quotemeta(str)
  # Escape every regex metacharacter so the result matches +str+ literally.
  # BUG FIX: the old hand-rolled escape list ([{}()\[\].?+*\\^$]) omitted
  # '|', so a literal such as "a|b" compiled to an alternation matching "a"
  # or "b" instead of the three-character string; Regexp.escape covers the
  # full metacharacter set.
  Regexp.escape(str)
end
751
+
752
def procify(processor)
  # wrap a processor so it runs only on error-free nodes and always
  # reports :ignore, leaving no mark on the AST
  case processor
  when Proc
    lambda do |node|
      processor.call(node) unless node.error?
      :ignore
    end
  when Symbol
    # resolve and cache the named method, verifying its signature
    @tests[processor] ||= begin
      begin
        m = singleton.method(processor)
      rescue ::NameError
        raise Error, "#{processor} is not defined"
      end
      raise Error, "#{processor} can only take a single argument" unless m.arity == 1

      lambda do |node|
        m.call(node) unless node.error?
        :ignore
      end
    end
  else
    raise Error, 'a node processor can only be a proc or a symbol'
  end
end
777
+ end