gullah 0.0.0
- checksums.yaml +7 -0
- data/.gitignore +1 -0
- data/LICENSE +21 -0
- data/README.md +87 -0
- data/Rakefile +11 -0
- data/TODO.md +2 -0
- data/examples/hat.rb +27 -0
- data/examples/trash.rb +42 -0
- data/examples/xml.rb +45 -0
- data/gullah.gemspec +31 -0
- data/lib/gullah/atom.rb +132 -0
- data/lib/gullah/boundary.rb +11 -0
- data/lib/gullah/dotifier.rb +127 -0
- data/lib/gullah/error.rb +7 -0
- data/lib/gullah/hopper.rb +142 -0
- data/lib/gullah/iterator.rb +67 -0
- data/lib/gullah/leaf.rb +24 -0
- data/lib/gullah/node.rb +553 -0
- data/lib/gullah/parse.rb +233 -0
- data/lib/gullah/picker.rb +56 -0
- data/lib/gullah/rule.rb +90 -0
- data/lib/gullah/segment.rb +92 -0
- data/lib/gullah/trash.rb +15 -0
- data/lib/gullah/version.rb +7 -0
- data/lib/gullah.rb +777 -0
- data/test/basic_test.rb +451 -0
- data/test/big_tree_test.rb +26 -0
- data/test/boundary_test.rb +29 -0
- data/test/date_test.rb +111 -0
- data/test/error_test.rb +245 -0
- data/test/json_test.rb +124 -0
- data/test/parse_demo_test.rb +33 -0
- data/test/precondition_test.rb +68 -0
- data/test/tests_per_subrule_test.rb +49 -0
- data/test/tree_walking_test.rb +88 -0
- metadata +157 -0
data/lib/gullah/trash.rb
ADDED
@@ -0,0 +1,15 @@
# frozen_string_literal: true

module Gullah
  # a node just for trash
  class Trash < Node # :nodoc:
    # does this node represent a character sequence no leaf rule matched?
    def trash?
      true
    end

    def boundary?
      true
    end
  end
end
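A minimal sketch of how such trash nodes arise, assuming the grammar-definition API documented in data/lib/gullah.rb below (the Words grammar itself is hypothetical, not part of the gem):

  class Words
    extend Gullah

    rule :phrase, 'word+'
    leaf :word, /[a-z]+/i
  end

  parse = Words.first 'dog %% cat'
  # '%%' matches no leaf rule, so the lexer wraps it in a Trash node, for which
  # trash? and boundary? both return true; being a boundary, it also prevents
  # any rule from spanning it.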
data/lib/gullah.rb
ADDED
@@ -0,0 +1,777 @@
# frozen_string_literal: true

require 'set'
%w[version atom error hopper leaf node trash boundary parse rule iterator dotifier segment picker].each do |s|
  require "gullah/#{s}"
end

# A collection of class methods that can be added into a class to make it a parser.
# For example:
#
#   class Foo
#     extend Gullah
#
#     rule :plugh, 'foo bar+ | bar foo{1,3}'
#     rule :foo, 'number word'
#     rule :bar, 'punctuation "wow!"'
#     leaf :word, /[a-z]+/i
#     leaf :number, /\d+(?:\.\d+)?/
#     leaf :punctuation, /[^\w\s]+/
#   end
#
# Having defined a grammar like this, one can apply it to arbitrary strings to
# generate parse trees:
#
#   Foo.parse "123 cat @#$ wow! ___wow!"
#
# Gullah can produce parse trees from incomplete or ambiguous grammars. It can handle
# noisy data. One can apply arbitrary tests to parse nodes, including tests that
# depend on other nodes in the parse tree. In the case of test failure the nature
# of the failure is marked on the corresponding nodes in the parse tree.
#
# = Syntax
#
# This section describes only the syntax of Gullah rules, not the entire API. Gullah
# syntax is generally the more familiar subset of regular expression syntax.
#
# - <b>sequence</b>
#
#     rule :foo, 'bar baz' # one thing follows another
#
# - <b>alternation</b>
#
#     rule :foo, 'bar | baz' # separate alternates with pipes
#     rule :foo, 'plugh+'    # or simply define it additional times (not regex grammar)
#
# Note, you can define all alternates by simple redefinition as in the second line
# above. You can use the pipe syntax for convenience. Any tests or preconditions
# provided with a particular definition of the rule <em>will apply only for that definition</em>.
#
# - <b>repetition</b>
#
#     rule :option, 'foo?'    # ? means "one or none"
#     rule :plural, 'foo+'    # + means "one or more"
#     rule :options, 'foo*'   # * means "zero or more"
#     rule :n, 'foo{2}'       # {n} means "exactly n"
#     rule :n_plus, 'foo{2,}' # {n,} means "n or more"
#     rule :n_m, 'foo{2,3}'   # {n,m} means "between n and m"
#
# Note, though you can define rules like +option+ and +options+, a rule can't add
# a node to the parse tree if it matches nothing. These repetition suffixes
# are more useful as part of a sequence. In practice <tt>foo?</tt> will be a less
# efficient version of <tt>foo</tt>, and <tt>foo*</tt>, a less efficient version of
# <tt>foo+</tt>.
#
# - <b>literals</b>
#
#     rule :foo, '"(" bar ")"'
#
# Literals allow you to avoid defining simple leaf rules. The above is basically
# shorthand for
#
#     rule :foo, 'left_paren bar right_paren'
#     leaf :left_paren, /\(/
#     leaf :right_paren, /\)/
#
# You may use either single or double quotes to define literals. You may also use
# escape sequences to include random characters in literals. Literals may have
# repetition suffixes.
#
# - <b>grouping</b>
#
#     rule :foo, 'bar baz'
#
# Surprise! There is no grouping syntax in Gullah. Every rule is in effect a named group.
# So it might be better said that there are no anonymous groups in Gullah and grouping
# doesn't involve parentheses.
#
# You may be wondering about whitespace handling. See +ignore+ and +keep_whitespace+ below.
# The short version of it is that Gullah creates an ignorable whitespace leaf rule by
# default.
#
# = Preconditions
#
# The first step in adding a node to a parse tree is collecting a sequence of child
# nodes that match some rule. If the rule is
#
#   rule :foo, 'bar+'
#
# you've collected a sequence of +bar+ nodes. If there is some condition you need this
# node to respect <em>which is dependent only on the rule and the child nodes</em> which you
# can't express, or not easily, in the rule itself, you can define one or more
# preconditions. E.g.,
#
#   rule :foo, 'bar+', preconditions: %i[fibonacci]
#
#   def fibonacci(_name, _start, _end, _text, children)
#     is_fibonacci_number? children.length # assumes we've defined is_fibonacci_number?
#   end
#
# A precondition is just an instance method defined in the Gullah-fied class with an arity
# of five: it takes the rule's name, a symbol, as its first argument, the start and end character
# offsets of the match as its second and third arguments, the text being parsed as its fourth argument,
# and the prospective child nodes, an array, as its last argument. If it returns a truthy value, the
# precondition holds and the node can be made. Otherwise, Gullah tries the next thing.
#
# == Preconditions versus Tests
#
# Preconditions are like tests (see below). They are further conditions on the building of
# nodes in a parse tree. Why does Gullah provide both? There are several reasons:
#
# - Preconditions are tested before the node is built, avoiding the overhead of cloning
#   nodes, so they are considerably lighter weight.
# - Because they are tested <em>before</em> the node is built, they result in no partially erroneous
#   parse in the event of failure, so they leave nothing Gullah will attempt to improve further
#   at the cost of time.
# - But they don't leave a trace, so there's nothing to examine in the event of failure.
# - And they concern only the subtree rooted at the prospective node, so they cannot express
#   structural relationships between this node and nodes which do not descend from it.
#
# <b>Note</b>: preconditions cannot test relationships between <em>nodes</em> outside the prospective node's
# subtree, but they can test its relationships to adjoining <em>characters</em>, so they can
# implement lookarounds. For instance:
#
#   def colon_after(_rule_or_leaf_name, _start_offset, end_offset, text, _children)
#     text[end_offset..-1] =~ /\A\s*:/ # equivalent to (?=\s*:)
#   end
#
# = Tests
#
#   rule :all, 'baz+', tests: %i[half_even]
#   rule :baz, 'foo | bar'
#
#   leaf :foo, /\d+/, tests: %i[xenophilia]
#   leaf :bar, /[a-z]/i, tests: %i[xenophilia]
#
#   # node test!
#
#   # half the digit characters under this node must be even, half, odd
#   def half_even(node)
#     even, odd = node.text.chars.select { |c| c =~ /\d/ }.partition { |c| c.to_i.even? }
#     even.length == odd.length ? :pass : :fail
#   end
#
#   # structure test!
#
#   # foos need bars and bars need foos
#   def xenophilia(root, node)
#     if root.name == :all
#       sought = node.name == :foo ? :bar : :foo
#       root.descendants.any? { |n| n.name == sought } ? :pass : :fail
#     end
#   end
#
# A special feature of Gullah is that you can add arbitrary tests to its rules. For example
# you can use a simple regular expression to match a date and then a test to do a sanity
# check to confirm that the parts of the date, the year, month, and day, combine to produce
# a real date on the calendar. This is better than simply writing a thorough regular expression
# because it gives you the opportunity to tell the user *how* a match failed rather than simply
# that it failed. This feature is Gullah's answer to such things as lookarounds and back
# references: you've matched a simple pattern; now does this pattern fit sanely with its context?
#
# There are two sorts of tests: node tests and structure tests. Node tests are tests that need
# only the node itself and its subtree as inputs. Structure tests are tests that depend on
# elements of the parse tree outside of the subtree rooted at the node itself.
#
# Tests are implemented as instance methods of the Gullah-fied class. If the method has an arity
# of one, it is a node test. Its single argument is the node matched. If it has an arity of two,
# it is a structure test. The first argument is an ancestor node of the node corresponding to the
# rule. The second argument is the node itself. Because structure tests cannot be run until the
# node has some ancestor, and then they might not apply to all ancestors, they can be in a "pending"
# state, where the test is queued to run but has not yet run.
#
# Tests must return one of four values: +:pass+, +:fail+, +:ignore+, or +nil+. Only structure
# tests may return +nil+, which indicates that the preconditions for the test have not yet been
# met. If a structure test returns +nil+, the test remains in a pending state and it will be run
# again when the node acquires a new ancestor.
#
# If a node test passes, the node is accepted into the parse tree. If it fails, the node is marked
# as erroneous and the particular cause of its failure is marked in the abstract syntax tree. If
# this tree is returned to the user, they will see this information. In addition to +:fail+, the test
# may return more specific explanatory information:
#
#   rule :word, /\w+/, tests: %i[we_want_foo!]
#
#   def we_want_foo!(n)
#     if n.text =~ /foo/
#       :pass
#     else
#       [:fail, %Q[we really wanted to see "foo" but all we got was #{n.text.inspect}]]
#     end
#   end
#
# If a test returns +:pass+, the fact that the node passed the test in question will be added to
# its +attributes+ hash in the AST.
#
# If a test returns +:ignore+, this will constitute a pass, but no edits will be made to the AST.
#
# Tests short-circuit! If a node has many tests, they run until one fails.
#
# == Disadvantages of Tests
#
# All this being said, when tests <em>fail</em> they do so after their node has been built and added
# to a parse. This means their partially broken parse remains a candidate as Gullah tries to
# find the least bad way to parse the text it was given. This can be computationally expensive.
# If you can make do with preconditions (see above), they are the better choice.
#
# = Processors
#
#   rule :word, /[a-z]+/i, process: :abbrv
#   leaf :integer, /[1-9]\d*/, process: ->(n) { n.atts[:val] = n.text.to_i }
#
#   def abbrv(node)
#     node.attributes[:abbreviation] = node.text.gsub(/(?<!^)[aeiou]/, '')[0...5]
#   end
#
# Any rule may have a +process+ named argument whose value is either a proc or a symbol.
# If it is a symbol, it must be the name of an instance method of the Gullah-fied class.
# In either case, the arity of the code in question must be one: its single argument will
# be the node created by the rule.
#
# The processing code may do anything -- log the event, provide a breakpoint -- but its
# expected use is to calculate and store some attribute of the node or its subtree in the
# node's attribute hash, most likely to accelerate other tests that will depend on this
# value. You may use this mechanism for other purposes, of course, to compile the text
# parsed into a more useful object, say, but because processing may occur on nodes which
# are later discarded in failed parses, it may be more efficient to defer such handling
# of the AST until the parse completes.
#
# Processors run after any tests have completed and only if they all pass.
#
# = Motivation
#
# Why does Gullah exist? Well, mostly because it seemed like fun to make it. I have made
# other grammar-adjacent things -- a recursive descent parser in Java inspired by the grammars
# of Raku, various regular expression optimization libraries in various languages, a simple
# grammar-esque regular expression enhancer for Rust that produces abstract syntax trees but
# can't handle recursion -- so I was thinking about the topic. A problem I faced with the recursive
# descent parser, which I later learned was a well-known problem, was infinite left-recursion.
# If you have a rule such as <tt>X -> X Y | Z</tt>, where an +X+ can be made of other +X+ es, your recursive
# descent parser constructs an infinitely long plan that never touches the data -- "I'll try an X, which
# means I'll first try an X, which means I'll first try an X..." The solution to this is to create an
# arbitrary, perhaps adjustable, recursion limit, recognize this pattern of recursion, and bail out
# when you find you've planned too long without executing anything. This is how I solved the problem in
# the library I wrote, but I found this unsatisfactory.
#
# An alternative solution, it occurred to me, was to start with the data rather than the plan. "I have
# an +X+. What can I make with this?" This instantly solves the left recursion problem, because the application
# of a rule must consume nodes, and it seems like a more
# reasonable way to parse things generally. As a latent linguist, this appealed to me as more psychologically
# realistic. Certainly people understand words in part by approaching language with expectations -- the top-down
# pattern you see in recursive descent -- but people are constantly confronted with text begun in the middle or
# interrupted or repaired mid-sentence, so they must be able as well to take the words they hear and try to
# make something from them. So I wanted to make a data-driven, bottom-up parser.
#
# (One thing I should say up front is that the design of Gullah is based entirely on my own pondering. I am not
# a very enthusiastic reader of other people's research. I am aware that a lot of work has been done on
# parsing and parser design, but the fun for me is in coming up with the ideas more than in doing the background
# reading, so I have just dived in. I am sure I have reinvented some wheels in this, most likely badly.)
#
# (Another aside: The left-recursion problem disappears with a bottom-up parser, which must consume data to proceed, but it
# is replaced with a unary-branching problem. If you have a rule that says an +A+ can be relabeled +B+ -- that
# is, you can add a node with a single child -- you risk an infinite loop. You may define rules such that +A+ becomes
# +B+, and another rule, or series of rules, which turns this +B+ back into an +A+. So this bottom-up parser has
# a somewhat unsatisfactory loop check as well.)
#
# A side benefit of bottom-up parsing is that it is robust against ill-formed data. If you can't make what you
# set out to make, at least you can make something. And the structure you build out of the data can show very
# clearly where it has gone wrong. As a linguist, this appealed to my desire to model natural languages with
# all their noise and redundancy. As a programmer, this appealed to me as a way to make data problems
# transparent and solvable.
#
# = Efficiency
#
# I have taken care to make rules fail fast and have followed a dynamic programming model in which I cache
# information which would otherwise be recalculated in many recursions, but Gullah is certainly not as
# efficient as a parser custom designed for a particular language. A SAX parser of XML, for example, can
# process its input in linear time by pushing half-processed constructs onto a stack. The general mechanism
# underlying Gullah is worst-case quadratic, because events already seen may have to be scanned again to
# see whether recent decisions have changed whether they can be handled. If every node added to a
# provisional parse tree reduces the unprocessed node count by one and every scan on average finishes
# halfway through the unhandled nodes, this would mean n(n - 1)/2 comparisons to complete the tree. I doubt,
# though I cannot prove, that one could improve on this while maintaining one's parser's ability to handle
# broken data or ambiguous grammars. Ranking rules to try next based on past experience in the tree
# might improve the speed of parse discovery, but at the cost of greater complexity in the handling of any
# single scan.
#
# So if you have a particular data format or language you want to handle efficiently and you expect in most
# cases you will succeed without ambiguity on a single pass, Gullah is not the tool you want. But if you
# want to recover gracefully, it may be that a second pass with Gullah to produce the least bad parse and
# some information about how things went wrong is useful.

module Gullah
  ##
  # Define a tree structure rule. This specifies how tree nodes may be grouped under
  # another node. The required arguments are +name+ and +body+. The former is a label
  # for the node under which the others are grouped. The latter is a string defining
  # the rule.
  #
  #   rule :sequence, 'this then this'
  #
  #   rule :quantifiers, 'foo bar? baz* plugh+ qux{2} quux{3,} corge{4,5}'
  #
  #   rule :alternates, 'this | or | that'
  #   # you may also add alternates like so
  #   rule :alternates, 'also | these | and | those'
  #   rule :alternates, 'etc'
  #
  #   rule :literals, %['this' "that"]
  #
  #   rule :escapes, 'foo\\? "bar\\""'
  #
  #   # the optional named arguments:
  #
  #   rule :process, 'aha', process: ->(n) { log "Aha! we just matched #{n.text}!" }
  #   rule :or_maybe, 'oho', process: :some_arity_one_method_in_class_extending_gullah
  #
  #   rule :tests, 'test me', tests: %i[node structure]
  def rule(name, body, tests: [], preconditions: [], process: nil)
    raise Error, 'tests must be an array' unless tests.is_a? Array
    raise Error, 'preconditions must be an array' unless preconditions.is_a? Array

    init
    init_check(name)
    name = name.to_sym
    body = body.to_s.strip.gsub(/\s+/, ' ')
    return if dup_check(:rule, name, body, tests + preconditions)

    tests << [process] if process
    r = Rule.new name, body, tests: tests, preconditions: preconditions
    subrules = r.subrules || [r]
    subrules.each do |sr|
      @rules << sr
      sr.starters.each do |r, n|
        (@starters[r] ||= []) << n
      end
    end
    r.literals.each do |sym|
      leaf sym.to_s, Regexp.new(quotemeta(sym.to_s))
    end
  end

  ##
  # Don't make whitespace automatically ignorable.
  #
  #   class Foo
  #     extend Gullah
  #
  #     keep_whitespace
  #
  #     rule :a, 'a+'
  #     leaf :a, /a/
  #   end
  #
  #   Foo.parse "aaa aaa"
  #
  # In this example, the parse tree would consist of two a nodes, each parent to three 'a' leaves,
  # separated by a "trash" node corresponding to the whitespace, for which no leaf rule was provided.
  def keep_whitespace
    @keep_whitespace = true
  end

  ##
  # A tokenization rule to divide the raw text into tokens to be matched by rules.
  #
  # The required arguments are a name and a regular expression. The name is what other
  # rules will refer to. The regular expression of course defines the character sequence
  # the rule matches. The more precise the regular expression the fewer false possibilities
  # Gullah will have to sort through to find the best parse(s). Boundary markers in
  # particular, +\b+ or lookarounds such as <tt>(?<!\d)</tt>, are helpful in this regard.
  #
  # The optional arguments are +tests+ and +process+. See +rule+ for more regarding these.
  #
  #   leaf :word, /\b\w+\b/
  #   leaf :integer, /(?<!\d)[1-9]\d*(?!\d)/, process: ->(n) { n.atts[:val] = n.text.to_i }
  #   leaf :name, /Bob/, tests: [:not_bobbing]
  #
  #   def not_bobbing(n)
  #     /bing/.match(n.full_text, n.end) ? :fail : :pass
  #   end
  def leaf(name, rx, tests: [], preconditions: [], process: nil)
    _leaf name, rx, ignorable: false, tests: tests, process: process, preconditions: preconditions
  end

  ##
  # A tokenization rule like +leaf+, but whose tokens are invisible to other rules.
  # The +ignore+ method is otherwise identical to +leaf+.
  #
  # Unless +keep_whitespace+ is called, an +ignore+ rule covering whitespace will be
  # generated automatically. Its name will be "_ws", or, if that is taken, "_wsN", where
  # N is an integer sufficient to make this name unique among the rules of the grammar.
  def ignore(name, rx, tests: [], preconditions: [], process: nil)
    _leaf name, rx, ignorable: true, tests: tests, process: process, preconditions: []
  end

  ##
  # A tokenization rule like +leaf+, but whose tokens cannot be the children of other nodes.
  # The +boundary+ method is otherwise identical to +leaf+.
  #
  # Boundaries are extremely valuable for reducing the complexity of parsing, because Gullah
  # knows no parse can span a boundary. Trash nodes -- nodes that correspond to character
  # sequences unmatched by any leaf rule -- are also boundaries, though most likely erroneous
  # ones.
  #
  #   # clause boundary pattern
  #   boundary :terminal, /[.!?](?=\s*\z|\s+"?\p{Lu})|[:;]/
  def boundary(name, rx, tests: [], preconditions: [], process: nil)
    _leaf name, rx, boundary: true, tests: tests, preconditions: preconditions, process: process
  end

  ##
  # Obtain the set of optimal parses of the given text. Optimality is determined
  # by four criteria. In every case the smaller the number the better.
  #
  # correctness:: The count of node or structure tests that have failed.
  # completion:: The count of root nodes.
  # pending:: The count of structure tests that were not applied.
  # size:: The total number of nodes.
  #
  # You can adjust the optimality conditions only by removing them via the optional
  # +filters+ argument. If you supply this argument, only the optimality criteria you
  # specify will be applied. The order of application is fixed: if parse A is more
  # correct than parse B, it will be kept and B discarded even if B is more complete,
  # has fewer pending tests, and fewer nodes.
  #
  # The optional +n+ parameter can be used to specify the desired number of parses.
  # This is useful if your parse rules are ambiguous. For example, consider the grammar
  #
  #   class Binary
  #     extend Gullah
  #     rule :a, 'a{2}'
  #     leaf :a, /\S+/
  #   end
  #
  # If you ask this to parse the string "a b c d e f g h i j k l" it will produce
  # 58,786 equally good parses. These will consume a lot of memory and producing them
  # will consume a lot of time. The +n+ parameter will let you get on with things faster.
  #
  # A caveat: Because of the way Gullah works you may not get exactly +n+ parses
  # back when you ask for +n+. There may not be sufficiently many parses, of course, but
  # you may also get back more than +n+ parses if the text you are parsing contains
  # parsing boundaries. Gullah parses the portions of text inside the boundaries separately,
  # so the number of possible parses will be the product of the number of parses of
  # each bounded segment. If you have a sentence boundary in the middle of your text,
  # and thus two segments, the number of parses of the entire text will be the number
  # of parses of the first segment times the number of parses of the second. If the first
  # has two parses and the second also has two but you ask for 3, the number of parses
  # Gullah will find as it goes will be 1, then 2, then 4. There is no iteration of the
  # process in which Gullah has found exactly 3 parses. The 4 it has found are necessarily
  # all equally good, so rather than arbitrarily choosing 3 and discarding one, Gullah
  # will return all 4.
  def parse(text, filters: %i[correctness completion pending size], n: nil)
    raise Error, 'n must be positive' if n&.zero?

    commit
    segments = segment(text.to_s, filters, n)
    initial_segments = segments.select { |s| s.start.zero? }
    if n
      # iterate till all segments done or we get >= n parses
      # another place to start parallelization
      while (s = segments.reject(&:done).min_by(&:weight))
        break if s.next && initial_segments.sum(&:total_parses) >= n
      end
    else
      # iterate till all segments done
      # NOTE: could be parallelized
      while (s = segments.find { |s| !s.done })
        s.next
      end
    end
    if segments.length > 1
      # pass the results through a new hopper to filter out duds
      hopper = Hopper.new filters, nil
      initial_segments.flat_map(&:results).each { |p| hopper << p }
      hopper.dump.each(&:initialize_summaries)
    else
      segments.first.results
    end
  end

  ##
  # The first parse found. This takes the same arguments as +parse+ minus +n+.
  # If there are no parses without errors or unsatisfied pending tree structure
  # tests, it will be the first erroneous or incomplete parse.
  #
  # If you expect the parse to succeed and be unambiguous, this is the method you
  # want.
  def first(text, filters: %i[correctness completion pending size])
    parse(text, filters: filters, n: 1).first
  end

  # :stopdoc:

  private

  def init
    return if iv_check :@rules

    @rules = []
    @leaves = []
    @starters = {}
    @tests = {}
    @preconditions = {}
    @committed = false
    @do_unary_branch_check = nil
  end

  def iv_check(var)
    v = instance_variable_defined?(var) && instance_variable_get(var)
    v && block_given? ? yield(v) : v
  end

  # do some sanity checking, initialization, and optimization
  def commit
    return if iv_check(:@committed)
    raise Error, "#{name} has no leaves" unless iv_check(:@leaves, &:any?)

    # add the whitespace rule unless told otherwise
    if iv_check(:@keep_whitespace)
      remove_instance_variable :@keep_whitespace
    else
      used_rules = (@rules.map(&:name) + @leaves.map(&:name)).to_set
      base = '_ws'
      count = nil
      count = count.to_i + 1 while used_rules.include? "#{base}#{count}".to_sym
      _leaf "#{base}#{count}".to_sym, /\s+/, ignorable: true
    end

    # vet on commit so rule definition is order-independent
    [@leaves, @rules].flatten.each do |r|
      vetted_tests = r.tests.map { |t| vet t }
      vetted_preconds = r.preconditions.map { |pc| vet_precondition pc }
      r._post_init(vetted_tests, vetted_preconds)
    end
    completeness_check
    loop_check
    # arrange things so we first try rules that can complete more of the parse;
    # better would be sorting by frequency in parse trees, but we don't have
    # that information
    @starters.transform_values { |atoms| atoms.sort_by(&:max_consumption).reverse }
    remove_instance_variable :@leaf_dup_check if iv_check(:@leaf_dup_check)
    remove_instance_variable :@rule_dup_check if iv_check(:@rule_dup_check)
    @committed = true
  end

  # has every rule/leaf required by some rule been defined?
  def completeness_check
    available = (@rules + @leaves).map(&:name).to_set
    sought = @rules.flat_map(&:seeking).uniq.to_set
    problems = sought.reject { |s| available.include? s }
    raise Error, "the following rules or leaves remain undefined: #{problems.join(', ')}" if problems.any?
  end

  # define the @do_unary_branch_check variable
  def loop_check
    @do_unary_branch_check = false
    links = @rules.select(&:potentially_unary?).flat_map(&:branches).uniq
    if links.any?
      potential_loops = links.map { |l| LoopCheck.new l }
      catch :looped do
        while potential_loops.any?
          new_potential_loops = []
          links.each do |l|
            potential_loops.each do |pl|
              if (npl = pl.add(l, self))
                new_potential_loops << npl
              end
            end
          end
          potential_loops = new_potential_loops
        end
      end
    end
  end

  class LoopCheck
    def initialize(link)
      @seen = Set.new(link)
      @seeking = link.last
    end

    def add(link, grammar)
      if @seeking == link.first
        if @seen.include? link.last
          grammar.instance_variable_set :@do_unary_branch_check, true
          throw :looped
        end
        LoopCheck.new(@seen.to_a + [link.last])
      end
    end
  end

  def init_check(name)
    raise Error, "cannot define #{name}; all rules must be defined before parsing" if @committed
  end

  # a tokenization rule to divide the raw text into tokens and separators ("ignorable" tokens)
  def _leaf(name, rx, ignorable: false, boundary: false, tests: [], preconditions: [], process: nil)
    raise Error, 'tests must be an array' unless tests.is_a? Array
    raise Error, 'preconditions must be an array' unless preconditions.is_a? Array

    init
    init_check(name)
    name = name.to_sym
    return if dup_check(:leaf, name, rx, tests + preconditions)

    tests << [process] if process
    @leaves << Leaf.new(name, rx, ignorable: ignorable, boundary: boundary, tests: tests, preconditions: preconditions)
  end

  # convert raw text into one or more arrays of leaf nodes -- maximally unreduced parses
  def lex(text)
    bases = [[0, Parse.new(text)]]
    done = []
    while bases.any?
      offset, parse = bases.shift
      added_any = false
      @leaves.each do |leaf|
        # can this leaf rule extract a leaf at this offset?
        next unless (md = leaf.rx.match(text, offset)) && md.begin(0) == offset

        e = md.end(0)
        next if leaf.preconditions.any? { |pc| pc.call(leaf.name, offset, e, text, []) == :fail }

        added_any = true
        new_parse = parse.add(offset, e, leaf, @do_unary_branch_check, false, leaf.boundary)
        if e == text.length
          done << new_parse
        else
          bases << [e, new_parse]
        end
      end
      next if added_any

      # try to eliminate trash
      trash_offset = text.length
      @leaves.each do |leaf|
        # is there a leaf like this closer to the current offset?
        next unless
          (md = leaf.rx.match(text, offset)) &&
          (b = md.begin(0)) &&
          (b < trash_offset) &&
          (e = md.end(0)) &&
          leaf.preconditions.none? { |pc| pc.call(leaf.name, b, e, text, []) == :fail }

        trash_offset = b
      end
      new_parse = parse.add(offset, trash_offset, trash_rule, false, true)
      if trash_offset == text.length
        done << new_parse
      else
        bases << [trash_offset, new_parse]
      end
    end
    done # an array of Parses
  end

  # slice text into independent segments
  def segment(text, filters, n)
    uncollected_segments = lex(text).flat_map(&:split)
    segments = uncollected_segments.group_by { |s| [s.start, s.end] }.values.map do |segs|
      Segment.new segs, filters, @starters, @do_unary_branch_check, n
    end
    segments.group_by(&:end).each do |final_offset, segs|
      continuations = segments.select { |s| s.start == final_offset }
      segs.each { |s| s.continuations = continuations }
    end
    segments
  end

  def trash_rule
    @trash_rule ||= Leaf.new(:"", nil)
  end

  def singleton
    @singleton ||= new
  end

  # check for duplicate rule/leaf
  # return true if perfect duplicate, false if novel
  def dup_check(type, name, body, tests)
    set = type == :leaf ? (@leaf_dup_check ||= ::Set.new) : (@rule_dup_check ||= ::Set.new)
    key = [name, body, tests.sort]
    if set.include? key
      true
    else
      set << key
      false
    end
  end

  # vet tests
  def vet(test)
    if test.is_a? Array
      # this is a processing function, not a real test
      return procify(test.first)
    end

    @tests[test] ||= begin
      begin
        m = singleton.method(test)
      rescue ::NameError
        raise Error, "#{test} is not defined"
      end
      raise Error, "#{test} must take either one or two arguments" unless (1..2).include? m.arity

      m
    end
  end

  # vet preconditions
  def vet_precondition(precond)
    @preconditions[precond] ||= begin
      begin
        m = singleton.method(precond)
      rescue ::NameError
        raise Error, "#{precond} is not defined"
      end
      raise Error, <<-MESSAGE.strip.gsub(/\s+/, ' ') unless m.arity == 5
        #{precond} must take five arguments:
        the rule or leaf name,
        the start character offset,
        the end character offset,
        the text being parsed,
        and the prospective children
      MESSAGE

      m
    end
  end

  # escape a string literal for use in a regex
  def quotemeta(str)
    quoted = ''
    (0...str.length).each do |i|
      c = str[i]
      quoted += '\\' if c =~ /[{}()\[\].?+*\\^$]/
      quoted += c
    end
    quoted
  end

  def procify(processor)
    case processor
    when Symbol
      @tests[processor] ||= begin
        begin
          m = singleton.method(processor)
        rescue ::NameError
          raise Error, "#{processor} is not defined"
        end
        raise Error, "#{processor} can only take a single argument" unless m.arity == 1

        lambda { |n|
          m.call(n) unless n.error?
          return :ignore
        }
      end
    when Proc
      lambda { |n|
        processor.call(n) unless n.error?
        return :ignore
      }
    else
      raise Error, 'a node processor can only be a proc or a symbol'
    end
  end
end
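For orientation, a hedged end-to-end sketch assembled from the API documented above (rule, leaf, preconditions, tests, parse, and first). The Dates grammar and its helper methods are hypothetical illustrations, not part of the gem, and the precondition follows the documented contract of returning a truthy value when it holds:

  require 'gullah'

  class Dates
    extend Gullah

    rule :date, 'day month year', tests: %i[sane_date]

    leaf :day,   /\b\d{1,2}\b/
    leaf :month, /\b[a-z]+\b/i, preconditions: %i[known_month]
    leaf :year,  /\b\d{4}\b/

    # precondition: rule name, start offset, end offset, full text, prospective children
    def known_month(_name, s, e, text, _children)
      %w[january february march april may june july
         august september october november december].include? text[s...e].downcase
    end

    # node test: return :pass, :fail (optionally with an explanation), or :ignore
    def sane_date(node)
      day = node.text[/\d{1,2}/].to_i
      (1..31).cover?(day) ? :pass : [:fail, "no month has #{day} days"]
    end
  end

  best = Dates.first '21 June 1863'        # the single best parse
  all  = Dates.parse '21 June 1863', n: 2  # roughly two optimal parses (see the caveat on +n+ above)

Per the documentation above, the known_month precondition runs before a month leaf is added, while the sane_date test runs after the date node is built and records any failure on that node in the resulting parse tree.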