gullah 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/gullah/trash.rb ADDED
@@ -0,0 +1,15 @@
+ # frozen_string_literal: true
+
+ module Gullah
+   # a node just for trash
+   class Trash < Node # :nodoc:
+     # does this node represent a character sequence no leaf rule matched?
+     def trash?
+       true
+     end
+
+     def boundary?
+       true
+     end
+   end
+ end
data/lib/gullah/version.rb ADDED
@@ -0,0 +1,7 @@
+ # frozen_string_literal: true
+
+ module Gullah
+   # This is an alpha release. Gullah has not yet been used in anything
+   # other than unit tests.
+   VERSION = '0.0.0'
+ end
data/lib/gullah.rb ADDED
@@ -0,0 +1,777 @@
+ # frozen_string_literal: true
+
+ require 'set'
+ %w[version atom error hopper leaf node trash boundary parse rule iterator dotifier segment picker].each do |s|
+   require "gullah/#{s}"
+ end
+
+ # A collection of class methods that can be added into a class to make it a parser.
+ # For example:
+ #
+ #   class Foo
+ #     extend Gullah
+ #
+ #     rule :plugh, 'foo bar+ | bar foo{1,3}'
+ #     rule :foo, 'number word'
+ #     rule :bar, 'punctuation "wow!"'
+ #     leaf :word, /[a-z]+/i
+ #     leaf :number, /\d+(?:\.\d+)?/
+ #     leaf :punctuation, /[^\w\s]+/
+ #   end
+ #
+ # Having defined a grammar like this, one can apply it to arbitrary strings to
+ # generate parse trees:
+ #
+ #   Foo.parse "123 cat @#$ wow! ___wow!"
+ #
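+ # The +parse+ method (documented below) returns the set of optimal parses
+ # found, as an array -- a sketch:
+ #
+ #   Foo.parse("123 cat @#$ wow! ___wow!").each do |parse|
+ #     # examine each candidate parse tree here
+ #   end
+ #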
+ # Gullah can produce parse trees from incomplete or ambiguous grammars. It can handle
+ # noisy data. One can apply arbitrary tests to parse nodes, including tests that
+ # depend on other nodes in the parse tree. In the case of test failure the nature
+ # of the failure is marked on the corresponding nodes in the parse tree.
+ #
+ # = Syntax
+ #
+ # This section describes only the syntax of Gullah rules, not the entire API. Gullah
+ # syntax is generally the more familiar subset of regular expression syntax.
+ #
+ # - <b>sequence</b>
+ #
+ #     rule :foo, 'bar baz' # one thing follows another
+ #
+ # - <b>alternation</b>
+ #
+ #     rule :foo, 'bar | baz' # separate alternates with pipes
+ #     rule :foo, 'plugh+'    # or simply define the rule additional times (not regex syntax)
+ #
+ #   Note, you can define all alternates by simple redefinition as in the second line
+ #   above. You can use the pipe syntax for convenience. Any tests or preconditions
+ #   provided with a particular definition of the rule <em>will apply only for that definition</em>.
+ #
+ # - <b>repetition</b>
+ #
+ #     rule :option, 'foo?'    # ? means "one or none"
+ #     rule :plural, 'foo+'    # + means "one or more"
+ #     rule :options, 'foo*'   # * means "zero or more"
+ #     rule :n, 'foo{2}'       # {n} means "exactly n"
+ #     rule :n_plus, 'foo{2,}' # {n,} means "n or more"
+ #     rule :n_m, 'foo{2,3}'   # {n,m} means "between n and m"
+ #
+ #   Note, though you can define rules like +option+ and +options+, a rule can't add
+ #   a node to the parse tree if it matches nothing. These repetition suffixes are
+ #   more useful as part of a sequence. In practice <tt>foo?</tt> will be a less
+ #   efficient version of <tt>foo</tt>, and <tt>foo*</tt>, a less efficient version of
+ #   <tt>foo+</tt>.
+ #
+ # - <b>literals</b>
+ #
+ #     rule :foo, '"(" bar ")"'
+ #
+ #   Literals allow you to avoid defining simple leaf rules. The above is basically
+ #   shorthand for
+ #
+ #     rule :foo, 'left_paren bar right_paren'
+ #     leaf :left_paren, /\(/
+ #     leaf :right_paren, /\)/
+ #
+ #   You may use either single or double quotes to define literals. You may also use
+ #   escape sequences to include arbitrary characters in literals. Literals may have
+ #   repetition suffixes.
+ #
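+ #   For instance (mirroring the +rule+ examples below, an escaped question mark
+ #   and an embedded double quote):
+ #
+ #     rule :escapes, 'foo\\? "bar\\""'
+ #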
+ # - <b>grouping</b>
+ #
+ #     rule :foo, 'bar baz'
+ #
+ #   Surprise! There is no grouping syntax in Gullah. Every rule is in effect a named group.
+ #   So it might be better said that there are no anonymous groups in Gullah and grouping
+ #   doesn't involve parentheses.
+ #
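+ #   Where a regex would use an anonymous group such as <tt>(\w+|\d+)+</tt>, define
+ #   a named rule and repeat that instead -- a sketch:
+ #
+ #     rule :list, 'item+'
+ #     rule :item, 'word | number'
+ #     leaf :word, /[a-z]+/i
+ #     leaf :number, /\d+/
+ #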
+ # You may be wondering about whitespace handling. See +ignore+ and +keep_whitespace+ below.
+ # The short version of it is that Gullah creates an ignorable whitespace leaf rule by
+ # default.
+ #
+ # = Preconditions
+ #
+ # The first step in adding a node to a parse tree is collecting a sequence of child
+ # nodes that match some rule. If the rule is
+ #
+ #   rule :foo, 'bar+'
+ #
+ # you've collected a sequence of +bar+ nodes. If there is some condition you need this
+ # node to respect, <em>dependent only on the rule and the child nodes</em>, which you
+ # can't express in the rule itself, or not easily, you can define one or more
+ # preconditions. E.g.,
+ #
+ #   rule :foo, 'bar+', preconditions: %i[fibonacci]
+ #
+ #   def fibonacci(_name, _start, _end, _text, children)
+ #     is_fibonacci_number? children.length # assumes we've defined is_fibonacci_number?
+ #   end
+ #
+ # A precondition is just an instance method defined in the Gullah-fied class with an arity
+ # of five: it takes the rule's name, a symbol, as its first argument, the start and end character
+ # offsets of the match as its second and third arguments, the text being parsed as its fourth argument,
+ # and the prospective child nodes, an array, as its last argument. If it returns a truthy value, the
+ # precondition holds and the node can be made. Otherwise, Gullah tries the next thing.
+ #
+ # == Preconditions versus Tests
+ #
+ # Preconditions are like tests (see below). They are further conditions on the building of
+ # nodes in a parse tree. Why does Gullah provide both? There are several reasons; a sketch
+ # contrasting the two forms follows this list:
+ #
+ # - Preconditions are tested before the node is built, avoiding the overhead of cloning
+ #   nodes, so they are considerably lighter weight.
+ # - Because they are tested <em>before</em> the node is built, they leave behind no partially
+ #   erroneous parse in the event of failure, and so nothing that Gullah will spend time
+ #   attempting to improve.
+ # - But they don't leave a trace, so there's nothing to examine in the event of failure.
+ # - And they concern only the subtree rooted at the prospective node, so they cannot express
+ #   structural relationships between this node and nodes which do not descend from it.
+ #
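+ # Here is the same cardinality check in both forms -- a sketch, not code from the
+ # library; it assumes a grammar defining +item+ and the +Node#children+ accessor:
+ #
+ #   # as a precondition: checked before the node is built; truthy means "go ahead"
+ #   rule :pair, 'item+', preconditions: %i[two_items]
+ #
+ #   def two_items(_name, _start, _end, _text, children)
+ #     children.length == 2
+ #   end
+ #
+ #   # as a test: checked after the node is built; a failure is recorded in the tree
+ #   rule :pair, 'item+', tests: %i[two_items_test]
+ #
+ #   def two_items_test(node)
+ #     node.children.length == 2 ? :pass : :fail
+ #   end
+ #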
+ # <b>Note</b>: preconditions cannot test relationships between <em>nodes</em> outside the prospective node's
+ # subtree, but they can test its relationships to adjoining <em>characters</em>, so they can
+ # implement lookarounds. For instance:
+ #
+ #   def colon_after(_rule_or_leaf_name, _start_offset, end_offset, text, _children)
+ #     text[end_offset..-1] =~ /\A\s*:/ # equivalent to (?=\s*:)
+ #   end
+ #
+ # = Tests
+ #
+ #   rule :all, 'baz+', tests: %i[half_even]
+ #   rule :baz, 'foo | bar'
+ #
+ #   leaf :foo, /\d+/, tests: %i[xenophilia]
+ #   leaf :bar, /[a-z]/i, tests: %i[xenophilia]
+ #
+ #   # node test!
+ #
+ #   # half the digit characters under this node must be even, half odd
+ #   def half_even(node)
+ #     even, odd = node.text.chars.select { |c| c =~ /\d/ }.partition { |c| c.to_i.even? }
+ #     even.length == odd.length ? :pass : :fail
+ #   end
+ #
+ #   # structure test!
+ #
+ #   # foos need bars and bars need foos
+ #   def xenophilia(root, node)
+ #     if root.name == :all
+ #       sought = node.name == :foo ? :bar : :foo
+ #       root.descendants.any? { |n| n.name == sought } ? :pass : :fail
+ #     end
+ #   end
+ #
+ # A special feature of Gullah is that you can add arbitrary tests to its rules. For example,
+ # you can use a simple regular expression to match a date and then a test to do a sanity
+ # check to confirm that the parts of the date, the year, month, and day, combine to produce
+ # a real date on the calendar. This is better than simply writing a thorough regular expression
+ # because it gives you the opportunity to tell the user *how* a match failed rather than simply
+ # that it failed. This feature is Gullah's answer to such things as lookarounds and back
+ # references: you've matched a simple pattern; now does this pattern fit sanely with its context?
+ #
+ # There are two sorts of tests: node tests and structure tests. Node tests are tests that need
+ # only the node itself and its subtree as inputs. Structure tests are tests that depend on
+ # elements of the parse tree outside of the subtree rooted at the node itself.
+ #
+ # Tests are implemented as instance methods of the Gullah-fied class. If the method has an arity
+ # of one, it is a node test. Its single argument is the node matched. If it has an arity of two,
+ # it is a structure test. The first argument is an ancestor node of the node corresponding to the
+ # rule. The second argument is the node itself. Because structure tests cannot be run until the
+ # node has some ancestor, and then they might not apply to all ancestors, they can be in a "pending"
+ # state, where the test is queued to run but has not yet run.
+ #
+ # Tests must return one of four values: +:pass+, +:fail+, +:ignore+, or +nil+. Only structure
+ # tests may return +nil+, which indicates that the preconditions for the test have not yet been
+ # met. If a structure test returns +nil+, the test remains in a pending state and it will be run
+ # again when the node acquires a new ancestor.
+ #
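+ # The +xenophilia+ test above illustrates this: it implicitly returns +nil+, and so
+ # stays pending, until the node acquires an +:all+ ancestor. A more minimal sketch
+ # (not from the library):
+ #
+ #   # pending until the node gains a :clause ancestor, then it passes
+ #   def under_clause(ancestor, _node)
+ #     :pass if ancestor.name == :clause # nil otherwise -- still pending
+ #   end
+ #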
+ # If a node test passes, the node is accepted into the parse tree. If it fails, the node is marked
+ # as erroneous and the particular cause of its failure is marked in the abstract syntax tree. If
+ # this tree is returned to the user, they will see this information. In addition to +:fail+, the test
+ # may return more specific explanatory information:
+ #
+ #   leaf :word, /\w+/, tests: %i[we_want_foo!]
+ #
+ #   def we_want_foo!(n)
+ #     if n.text =~ /foo/
+ #       :pass
+ #     else
+ #       [:fail, %Q[we really wanted to see "foo" but all we got was #{n.text.inspect}]]
+ #     end
+ #   end
+ #
+ # If a test returns +:pass+, the fact that the node passed the test in question will be added to
+ # its +attributes+ hash in the AST.
+ #
+ # If a test returns +:ignore+, this will constitute a pass, but no edits will be made to the AST.
+ #
+ # Tests short-circuit! If a node has many tests, they run until one fails.
+ #
+ # == Disadvantages of Tests
+ #
+ # All this being said, when tests <em>fail</em> they do so after their node has been built and added
+ # to a parse. This means the partially broken parse remains a candidate as Gullah tries to
+ # find the least bad way to parse the text it was given. This can be computationally expensive.
+ # If you can make do with preconditions (see above), they are the better choice.
+ #
+ # = Processors
+ #
+ #   leaf :word, /[a-z]+/i, process: :abbrv
+ #   leaf :integer, /[1-9]\d*/, process: ->(n) { n.atts[:val] = n.text.to_i }
+ #
+ #   def abbrv(node)
+ #     node.attributes[:abbreviation] = node.text.gsub(/(?<!^)[aeiou]/, '')[0...5]
+ #   end
+ #
+ # Any rule may have a +process+ named argument whose value is either a proc or a symbol.
+ # If it is a symbol, it must be the name of an instance method of the Gullah-fied class.
+ # In either case, the arity of the code in question must be one: its single argument will
+ # be the node created by the rule.
+ #
+ # The processing code may do anything -- log the event, provide a breakpoint -- but its
+ # expected use is to calculate and store some attribute of the node or its subtree in the
+ # node's attribute hash, most likely to accelerate other tests that will depend on this
+ # value. You may use this mechanism for other purposes, of course, to compile the text
+ # parsed into a more useful object, say, but because processing may occur on nodes which
+ # are later discarded in failed parses, it may be more efficient to defer such handling
+ # of the AST until the parse completes.
+ #
+ # Processors run after any tests have completed and only if they all pass.
+ #
+ # = Motivation
+ #
+ # Why does Gullah exist? Well, mostly because it seemed like fun to make it. I have made
+ # other grammar-adjacent things -- a recursive descent parser in Java inspired by the grammars
+ # of Raku, various regular expression optimization libraries in various languages, a simple
+ # grammar-esque regular expression enhancer for Rust that produces abstract syntax trees but
+ # can't handle recursion -- so I was thinking about the topic. A problem I faced with the recursive
+ # descent parser, which I later learned was a well-known problem, was infinite left-recursion.
+ # If you have a rule such as <tt>X -> X Y | Z</tt>, where an +X+ can be made of other +X+s, your recursive
+ # descent parser constructs an infinitely long plan that never touches the data -- "I'll try an X, which
+ # means I'll first try an X, which means I'll first try an X..." The solution to this is to create an
+ # arbitrary, perhaps adjustable, recursion limit, recognize this pattern of recursion, and bail out
+ # when you find you've planned too long without executing anything. This is how I solved the problem in
+ # the library I wrote, but I found this unsatisfactory.
+ #
+ # An alternative solution, it occurred to me, was to start with the data rather than the plan. "I have
+ # an +X+. What can I make with this?" This instantly solves the left recursion problem, because the
+ # application of a rule must consume nodes, and it seems like a more reasonable way to parse things
+ # generally. As a latent linguist, this appealed to me as more psychologically
+ # realistic. Certainly people understand words in part by approaching language with expectations -- the top-down
+ # pattern you see in recursive descent -- but people are constantly confronted with text begun in the middle or
+ # interrupted or repaired mid-sentence, so they must be able as well to take the words they hear and try to
+ # make something from them. So I wanted to make a data-driven, bottom-up parser.
+ #
+ # (One thing I should say up front is that the design of Gullah is based entirely on my own pondering. I am not
+ # a very enthusiastic reader of other people's research. I am aware that a lot of work has been done on
+ # parsing and parser design, but the fun for me is in coming up with the ideas more than in doing the background
+ # reading, so I have just dived in. I am sure I have reinvented some wheels in this, most likely badly.)
+ #
+ # (Another aside: The left-recursion problem disappears with a bottom-up parser, which must consume data to proceed, but it
+ # is replaced with a unary-branching problem. If you have a rule that says an +A+ can be relabeled +B+ -- that
+ # is, you can add a node with a single child -- you risk an infinite loop. You may define rules such that +A+ becomes
+ # +B+, and another rule, or series of rules, which turns this +B+ back into an +A+. So this bottom-up parser has
+ # a somewhat unsatisfactory loop check as well.)
+ #
+ # A side benefit of bottom-up parsing is that it is robust against ill-formed data. If you can't make what you
+ # set out to make, at least you can make something. And the structure you build out of the data can show very
+ # clearly where it has gone wrong. As a linguist, this appealed to my desire to model natural languages with
+ # all their noise and redundancy. As a programmer, this appealed to me as a way to make data problems
+ # transparent and solvable.
+ #
+ # = Efficiency
+ #
+ # I have taken care to make rules fail fast and have followed a dynamic programming model in which I cache
+ # information which would otherwise be recalculated in many recursions, but Gullah is certainly not as
+ # efficient as a parser custom designed for a particular language. A SAX parser of XML, for example, can
+ # process its input in linear time by pushing half-processed constructs onto a stack. The general mechanism
+ # underlying Gullah is worst-case quadratic, because events already seen may have to be scanned again to
+ # see whether recent decisions have changed whether they can be handled. If every node added to a
+ # provisional parse tree reduces the unprocessed node count by one and every scan on average finishes
+ # halfway through the unhandled nodes, this would mean n(n - 1)/2 comparisons to complete the tree. I doubt,
+ # though I cannot prove, that one could improve on this while maintaining one's parser's ability to handle
+ # broken data or ambiguous grammars. Ranking rules to try next based on past experience in the tree
+ # might improve the speed of parse discovery, but at the cost of greater complexity in the handling of any
+ # single scan.
+ #
+ # So if you have a particular data format or language you want to handle efficiently and you expect in most
+ # cases you will succeed without ambiguity on a single pass, Gullah is not the tool you want. But if you
+ # want to recover gracefully, it may be that a second pass with Gullah to produce the least bad parse and
+ # some information about how things went wrong is useful.
+
+ module Gullah
+   ##
+   # Define a tree structure rule. This specifies how tree nodes may be grouped under
+   # another node. The required arguments are +name+ and +body+. The former is a label
+   # for the node under which the others are grouped. The latter is a string defining
+   # the rule.
+   #
+   #   rule :sequence, 'this then this'
+   #
+   #   rule :quantifiers, 'foo bar? baz* plugh+ qux{2} quux{3,} corge{4,5}'
+   #
+   #   rule :alternates, 'this | or | that'
+   #   # you may also add alternates like so
+   #   rule :alternates, 'also | these | and | those'
+   #   rule :alternates, 'etc'
+   #
+   #   rule :literals, %['this' "that"]
+   #
+   #   rule :escapes, 'foo\\? "bar\\""'
+   #
+   #   # the optional named arguments:
+   #
+   #   rule :process, 'aha', process: ->(n) { log "Aha! we just matched #{n.text}!" }
+   #   rule :or_maybe, 'oho', process: :some_arity_one_method_in_class_extending_gullah
+   #
+   #   rule :tests, 'test me', tests: %i[node structure]
+   def rule(name, body, tests: [], preconditions: [], process: nil)
+     raise Error, 'tests must be an array' unless tests.is_a? Array
+     raise Error, 'preconditions must be an array' unless preconditions.is_a? Array
+
+     init
+     init_check(name)
+     name = name.to_sym
+     body = body.to_s.strip.gsub(/\s+/, ' ')
+     return if dup_check(:rule, name, body, tests + preconditions)
+
+     tests << [process] if process
+     r = Rule.new name, body, tests: tests, preconditions: preconditions
+     subrules = r.subrules || [r]
+     subrules.each do |sr|
+       @rules << sr
+       sr.starters.each do |sym, atom|
+         (@starters[sym] ||= []) << atom
+       end
+     end
+     r.literals.each do |sym|
+       leaf sym.to_s, Regexp.new(quotemeta(sym.to_s))
+     end
+   end
+
+   ##
+   # Don't make whitespace automatically ignorable.
+   #
+   #   class Foo
+   #     extend Gullah
+   #
+   #     keep_whitespace
+   #
+   #     rule :a, 'a+'
+   #     leaf :a, /a/
+   #   end
+   #
+   #   Foo.parse "aaa aaa"
+   #
+   # In this example, the parse tree would consist of two +a+ nodes, each parent to three 'a' leaves,
+   # separated by a "trash" node corresponding to the whitespace, for which no leaf rule was provided.
+   def keep_whitespace
+     @keep_whitespace = true
+   end
+
+   ##
+   # A tokenization rule to divide the raw text into tokens to be matched by rules.
+   #
+   # The required arguments are a name and a regular expression. The name is what other
+   # rules will refer to. The regular expression of course defines the character sequence
+   # the rule matches. The more precise the regular expression, the fewer false possibilities
+   # Gullah will have to sort through to find the best parse(s). Boundary markers in
+   # particular, +\b+ or lookarounds such as <tt>(?<!\d)</tt>, are helpful in this regard.
+   #
+   # The optional arguments are +tests+, +preconditions+, and +process+. See +rule+ for more regarding these.
+   #
+   #   leaf :word, /\b\w+\b/
+   #   leaf :integer, /(?<!\d)[1-9]\d*(?!\d)/, process: ->(n) { n.atts[:val] = n.text.to_i }
+   #   leaf :name, /Bob/, tests: [:not_bobbing]
+   #
+   #   def not_bobbing(n)
+   #     /bing/.match(n.full_text, n.end) ? :fail : :pass
+   #   end
+   def leaf(name, rx, tests: [], preconditions: [], process: nil)
+     _leaf name, rx, ignorable: false, tests: tests, process: process, preconditions: preconditions
+   end
+
+   ##
+   # A tokenization rule like +leaf+, but whose tokens are invisible to other rules.
+   # The +ignore+ method is otherwise identical to +leaf+.
+   #
+   # Unless +keep_whitespace+ is called, an +ignore+ rule covering whitespace will be
+   # generated automatically. Its name will be "_ws", or, if that is taken, "_wsN", where
+   # N is an integer sufficient to make this name unique among the rules of the grammar.
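+   #
+   #   # a sketch: treat shell-style comments in some hypothetical format as ignorable
+   #   ignore :comment, /#[^\n]*/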
+   def ignore(name, rx, tests: [], preconditions: [], process: nil)
+     _leaf name, rx, ignorable: true, tests: tests, process: process, preconditions: preconditions
+   end
+
+   ##
+   # A tokenization rule like +leaf+, but whose tokens cannot be the children of other nodes.
+   # The +boundary+ method is otherwise identical to +leaf+.
+   #
+   # Boundaries are extremely valuable for reducing the complexity of parsing, because Gullah
+   # knows no parse can span a boundary. Trash nodes -- nodes that correspond to character
+   # sequences unmatched by any leaf rule -- are also boundaries, though most likely erroneous
+   # ones.
+   #
+   #   # clause boundary pattern
+   #   boundary :terminal, /[.!?](?=\s*\z|\s+"?\p{Lu})|[:;]/
+   def boundary(name, rx, tests: [], preconditions: [], process: nil)
+     _leaf name, rx, boundary: true, tests: tests, preconditions: preconditions, process: process
+   end
+
+   ##
+   # Obtain the set of optimal parses of the given text. Optimality is determined
+   # by four criteria. In every case the smaller the number the better.
+   #
+   # correctness:: The count of node or structure tests that have failed.
+   # completion:: The count of root nodes.
+   # pending:: The count of structure tests that were not applied.
+   # size:: The total number of nodes.
+   #
+   # You can adjust the optimality conditions only by removing them via the optional
+   # +filters+ argument. If you supply this argument, only the optimality criteria you
+   # specify will be applied. The order of application is fixed: if parse A is more
+   # correct than parse B, it will be kept and B discarded even if B is more complete,
+   # has fewer pending tests, and fewer nodes.
+   #
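+   #   # a sketch: rank candidate parses by correctness and node count only
+   #   Foo.parse text, filters: %i[correctness size]
+   #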
+   # The optional +n+ parameter can be used to specify the desired number of parses.
+   # This is useful if your parse rules are ambiguous. For example, consider the grammar
+   #
+   #   class Binary
+   #     extend Gullah
+   #     rule :a, 'a{2}'
+   #     leaf :a, /\S+/
+   #   end
+   #
+   # If you ask this to parse the string "a b c d e f g h i j k l" it will produce
+   # 58,786 equally good parses. These will consume a lot of memory and producing them
+   # will consume a lot of time. The +n+ parameter will let you get on with things faster.
+   #
+   # A caveat: Because of the way Gullah works you may not get exactly +n+ parses
+   # back when you ask for +n+. There may not be sufficiently many parses, of course, but
+   # you may also get back more than +n+ parses if the text you are parsing contains
+   # parsing boundaries. Gullah parses the portions of text inside the boundaries separately,
+   # so the number of possible parses will be the product of the number of parses of
+   # each bounded segment. If you have a sentence boundary in the middle of your text,
+   # and thus two segments, the number of parses of the entire text will be the number
+   # of parses of the first segment times the number of parses of the second. If the first
+   # has two parses and the second also has two but you ask for 3, the number of parses
+   # Gullah will find as it goes will be 1, then 2, then 4. There is no iteration of the
+   # process in which Gullah has found exactly 3 parses. The 4 it has found are necessarily
+   # all equally good, so rather than arbitrarily choosing 3 and discarding one, Gullah
+   # will return all 4.
+   def parse(text, filters: %i[correctness completion pending size], n: nil)
+     raise Error, 'n must be positive' if n && n < 1
+
+     commit
+     segments = segment(text.to_s, filters, n)
+     initial_segments = segments.select { |s| s.start.zero? }
+     if n
+       # iterate till all segments done or we get >= n parses
+       # another place to start parallelization
+       while (s = segments.reject(&:done).min_by(&:weight))
+         break if s.next && initial_segments.sum(&:total_parses) >= n
+       end
+     else
+       # iterate till all segments done
+       # NOTE: could be parallelized
+       while (s = segments.find { |seg| !seg.done })
+         s.next
+       end
+     end
+     if segments.length > 1
+       # pass the results through a new hopper to filter out duds
+       hopper = Hopper.new filters, nil
+       initial_segments.flat_map(&:results).each { |p| hopper << p }
+       hopper.dump.each(&:initialize_summaries)
+     else
+       segments.first.results
+     end
+   end
+
+   ##
+   # The first parse found. This takes the same arguments as +parse+ minus +n+.
+   # If there are no parses without errors or unsatisfied pending tree structure
+   # tests, it will be the first erroneous or incomplete parse.
+   #
+   # If you expect the parse to succeed and be unambiguous, this is the method you
+   # want.
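+   #
+   #   # a sketch: the single best parse, possibly erroneous, of the text
+   #   best = Foo.first "123 cat @#$ wow!"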
+   def first(text, filters: %i[correctness completion pending size])
+     parse(text, filters: filters, n: 1).first
+   end
+
+   # :stopdoc:
+
+   private
+
+   def init
+     return if iv_check :@rules
+
+     @rules = []
+     @leaves = []
+     @starters = {}
+     @tests = {}
+     @preconditions = {}
+     @committed = false
+     @do_unary_branch_check = nil
+   end
+
+   def iv_check(var)
+     v = instance_variable_defined?(var) && instance_variable_get(var)
+     v && block_given? ? yield(v) : v
+   end
+
+   # do some sanity checking, initialization, and optimization
+   def commit
+     return if iv_check(:@committed)
+     raise Error, "#{name} has no leaves" unless iv_check(:@leaves, &:any?)
+
+     # add the whitespace rule unless told otherwise
+     if iv_check(:@keep_whitespace)
+       remove_instance_variable :@keep_whitespace
+     else
+       used_rules = (@rules.map(&:name) + @leaves.map(&:name)).to_set
+       base = '_ws'
+       count = nil
+       count = count.to_i + 1 while used_rules.include? "#{base}#{count}".to_sym
+       _leaf "#{base}#{count}".to_sym, /\s+/, ignorable: true
+     end
+
+     # vet on commit so rule definition is order-independent
+     [@leaves, @rules].flatten.each do |r|
+       vetted_tests = r.tests.map { |t| vet t }
+       vetted_preconds = r.preconditions.map { |pc| vet_precondition pc }
+       r._post_init(vetted_tests, vetted_preconds)
+     end
+     completeness_check
+     loop_check
+     # arrange things so we first try rules that can complete more of the parse;
+     # better would be sorting by frequency in parse trees, but we don't have
+     # that information
+     @starters.transform_values! { |atoms| atoms.sort_by(&:max_consumption).reverse }
+     remove_instance_variable :@leaf_dup_check if iv_check(:@leaf_dup_check)
+     remove_instance_variable :@rule_dup_check if iv_check(:@rule_dup_check)
+     @committed = true
+   end
+
+   # has every rule/leaf required by some rule been defined?
+   def completeness_check
+     available = (@rules + @leaves).map(&:name).to_set
+     sought = @rules.flat_map(&:seeking).uniq.to_set
+     problems = sought.reject { |s| available.include? s }
+     raise Error, "the following rules or leaves remain undefined: #{problems.join(', ')}" if problems.any?
+   end
+
+   # define the @do_unary_branch_check variable
+   def loop_check
+     @do_unary_branch_check = false
+     links = @rules.select(&:potentially_unary?).flat_map(&:branches).uniq
+     if links.any?
+       potential_loops = links.map { |l| LoopCheck.new l }
+       catch :looped do
+         while potential_loops.any?
+           new_potential_loops = []
+           links.each do |l|
+             potential_loops.each do |pl|
+               if (npl = pl.add(l, self))
+                 new_potential_loops << npl
+               end
+             end
+           end
+           potential_loops = new_potential_loops
+         end
+       end
+     end
+   end
+
+   class LoopCheck
+     def initialize(link)
+       @seen = Set.new(link)
+       @seeking = link.last
+     end
+
+     def add(link, grammar)
+       if @seeking == link.first
+         if @seen.include? link.last
+           grammar.instance_variable_set :@do_unary_branch_check, true
+           throw :looped
+         end
+         LoopCheck.new(@seen.to_a + [link.last])
+       end
+     end
+   end
+
+   def init_check(name)
+     raise Error, "cannot define #{name}; all rules must be defined before parsing" if @committed
+   end
+
+   # a tokenization rule to divide the raw text into tokens and separators ("ignorable" tokens)
+   def _leaf(name, rx, ignorable: false, boundary: false, tests: [], preconditions: [], process: nil)
+     raise Error, 'tests must be an array' unless tests.is_a? Array
+     raise Error, 'preconditions must be an array' unless preconditions.is_a? Array
+
+     init
+     init_check(name)
+     name = name.to_sym
+     return if dup_check(:leaf, name, rx, tests + preconditions)
+
+     tests << [process] if process
+     @leaves << Leaf.new(name, rx, ignorable: ignorable, boundary: boundary, tests: tests, preconditions: preconditions)
+   end
+
+   # convert raw text into one or more arrays of leaf nodes -- maximally unreduced parses
+   def lex(text)
+     bases = [[0, Parse.new(text)]]
+     done = []
+     while bases.any?
+       offset, parse = bases.shift
+       added_any = false
+       @leaves.each do |leaf|
+         # can this leaf rule extract a leaf at this offset?
+         next unless (md = leaf.rx.match(text, offset)) && md.begin(0) == offset
+
+         e = md.end(0)
+         next if leaf.preconditions.any? { |pc| pc.call(leaf.name, offset, e, text, []) == :fail }
+
+         added_any = true
+         new_parse = parse.add(offset, e, leaf, @do_unary_branch_check, false, leaf.boundary)
+         if e == text.length
+           done << new_parse
+         else
+           bases << [e, new_parse]
+         end
+       end
+       next if added_any
+
+       # try to eliminate trash
+       trash_offset = text.length
+       @leaves.each do |leaf|
+         # is there a leaf like this closer to the current offset?
+         next unless
+           (md = leaf.rx.match(text, offset)) &&
+           (b = md.begin(0)) &&
+           (b < trash_offset) &&
+           (e = md.end(0)) &&
+           leaf.preconditions.none? { |pc| pc.call(leaf.name, b, e, text, []) == :fail }
+
+         trash_offset = b
+       end
+       new_parse = parse.add(offset, trash_offset, trash_rule, false, true)
+       if trash_offset == text.length
+         done << new_parse
+       else
+         bases << [trash_offset, new_parse]
+       end
+     end
+     done # an array of Parses
+   end
+
+   # slice text into independent segments
+   def segment(text, filters, n)
+     uncollected_segments = lex(text).flat_map(&:split)
+     segments = uncollected_segments.group_by { |s| [s.start, s.end] }.values.map do |segs|
+       Segment.new segs, filters, @starters, @do_unary_branch_check, n
+     end
+     segments.group_by(&:end).each do |final_offset, segs|
+       continuations = segments.select { |s| s.start == final_offset }
+       segs.each { |s| s.continuations = continuations }
+     end
+     segments
+   end
+
+   def trash_rule
+     @trash_rule ||= Leaf.new(:"", nil)
+   end
+
+   def singleton
+     @singleton ||= new
+   end
+
+   # check for duplicate rule/leaf
+   # return true if perfect duplicate, false if novel
+   def dup_check(type, name, body, tests)
+     set = type == :leaf ? (@leaf_dup_check ||= ::Set.new) : (@rule_dup_check ||= ::Set.new)
+     key = [name, body, tests.sort]
+     if set.include? key
+       true
+     else
+       set << key
+       false
+     end
+   end
+
+   # vet tests
+   def vet(test)
+     if test.is_a? Array
+       # this is a processing function, not a real test
+       return procify(test.first)
+     end
+
+     @tests[test] ||= begin
+       begin
+         m = singleton.method(test)
+       rescue ::NameError
+         raise Error, "#{test} is not defined"
+       end
+       raise Error, "#{test} must take either one or two arguments" unless (1..2).include? m.arity
+
+       m
+     end
+   end
+
+   # vet preconditions
+   def vet_precondition(precond)
+     @preconditions[precond] ||= begin
+       begin
+         m = singleton.method(precond)
+       rescue ::NameError
+         raise Error, "#{precond} is not defined"
+       end
+       raise Error, <<-MESSAGE.strip.gsub(/\s+/, ' ') unless m.arity == 5
+         #{precond} must take five arguments:
+         the rule or leaf name,
+         the start character offset,
+         the end character offset,
+         the text being parsed,
+         and the prospective children
+       MESSAGE
+
+       m
+     end
+   end
+
+   # escape a string literal for use in a regex
+   def quotemeta(str)
+     quoted = ''
+     (0...str.length).each do |i|
+       c = str[i]
+       quoted += '\\' if c =~ /[{}()\[\].?+*\\^$|]/
+       quoted += c
+     end
+     quoted
+   end
+
+   def procify(processor)
+     case processor
+     when Symbol
+       @tests[processor] ||= begin
+         begin
+           m = singleton.method(processor)
+         rescue ::NameError
+           raise Error, "#{processor} is not defined"
+         end
+         raise Error, "#{processor} can only take a single argument" unless m.arity == 1
+
+         lambda { |n|
+           m.call(n) unless n.error?
+           return :ignore
+         }
+       end
+     when Proc
+       lambda { |n|
+         processor.call(n) unless n.error?
+         return :ignore
+       }
+     else
+       raise Error, 'a node processor can only be a proc or a symbol'
+     end
+   end
+ end