gullah 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +1 -0
- data/LICENSE +21 -0
- data/README.md +87 -0
- data/Rakefile +11 -0
- data/TODO.md +2 -0
- data/examples/hat.rb +27 -0
- data/examples/trash.rb +42 -0
- data/examples/xml.rb +45 -0
- data/gullah.gemspec +31 -0
- data/lib/gullah/atom.rb +132 -0
- data/lib/gullah/boundary.rb +11 -0
- data/lib/gullah/dotifier.rb +127 -0
- data/lib/gullah/error.rb +7 -0
- data/lib/gullah/hopper.rb +142 -0
- data/lib/gullah/iterator.rb +67 -0
- data/lib/gullah/leaf.rb +24 -0
- data/lib/gullah/node.rb +553 -0
- data/lib/gullah/parse.rb +233 -0
- data/lib/gullah/picker.rb +56 -0
- data/lib/gullah/rule.rb +90 -0
- data/lib/gullah/segment.rb +92 -0
- data/lib/gullah/trash.rb +15 -0
- data/lib/gullah/version.rb +7 -0
- data/lib/gullah.rb +777 -0
- data/test/basic_test.rb +451 -0
- data/test/big_tree_test.rb +26 -0
- data/test/boundary_test.rb +29 -0
- data/test/date_test.rb +111 -0
- data/test/error_test.rb +245 -0
- data/test/json_test.rb +124 -0
- data/test/parse_demo_test.rb +33 -0
- data/test/precondition_test.rb +68 -0
- data/test/tests_per_subrule_test.rb +49 -0
- data/test/tree_walking_test.rb +88 -0
- metadata +157 -0
data/lib/gullah/trash.rb
ADDED
@@ -0,0 +1,15 @@
# frozen_string_literal: true

module Gullah
  # a node just for trash
  class Trash < Node # :nodoc:
    # does this node represent a character sequence no leaf rule matched?
    def trash?
      true
    end

    def boundary?
      true
    end
  end
end
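A trash node is what the lexer emits for a character span no leaf rule matches (see the lex method in data/lib/gullah.rb below). A minimal sketch of checking a finished parse for trash; the roots accessor on the parse object is an assumption for illustration, not confirmed by this diff:

    # assuming a grammar class Foo extending Gullah, as in data/README.md
    parse = Foo.first('123 cat ~~~')
    parse.roots.select(&:trash?).each do |node|   # roots is assumed; text is used in the gem's own doc examples
      warn "unlexable input: #{node.text.inspect}"
    end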
data/lib/gullah.rb
ADDED
@@ -0,0 +1,777 @@
# frozen_string_literal: true

require 'set'
%w[version atom error hopper leaf node trash boundary parse rule iterator dotifier segment picker].each do |s|
  require "gullah/#{s}"
end

# A collection of class methods that can be added into a class to make it a parser.
# For example:
#
#   class Foo
#     extend Gullah
#
#     rule :plugh, 'foo bar+ | bar foo{1,3}'
#     rule :foo, 'number word'
#     rule :bar, 'punctuation "wow!"'
#     leaf :word, /[a-z]+/i
#     leaf :number, /\d+(?:\.\d+)?/
#     leaf :punctuation, /[^\w\s]+/
#   end
#
# Having defined a grammar like this, one can apply it to arbitrary strings to
# generate parse trees:
#
#   Foo.parse "123 cat @#$ wow! ___wow!"
#
# Gullah can produce parse trees from incomplete or ambiguous grammars. It can handle
# noisy data. One can apply arbitrary tests to parse nodes, including tests that
# depend on other nodes in the parse tree. In the case of test failure the nature
# of the failure is marked on the corresponding nodes in the parse tree.
#
# = Syntax
#
# This section describes only the syntax of Gullah rules, not the entire API. Gullah
# syntax is generally the more familiar subset of regular expression syntax.
#
# - <b>sequence</b>
#
#     rule :foo, 'bar baz' # one thing follows another
#
# - <b>alternation</b>
#
#     rule :foo, 'bar | baz' # separate alternates with pipes
#     rule :foo, 'plugh+'    # or simply define the rule additional times (this is not regex syntax)
#
#   Note, you can define all alternates by simple redefinition, as in the second line
#   above; the pipe syntax is a convenience. Any tests or preconditions
#   provided with a particular definition of the rule <em>will apply only for that definition</em>.
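#
#   For instance, in this sketch (+no_vowels+ is a hypothetical test method,
#   not one defined in this gem) only the first alternate is subject to the test:
#
#     rule :foo, 'bar+', tests: %i[no_vowels]
#     rule :foo, 'baz+'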
#
# - <b>repetition</b>
#
#     rule :option, 'foo?'    # ? means "one or none"
#     rule :plural, 'foo+'    # + means "one or more"
#     rule :options, 'foo*'   # * means "zero or more"
#     rule :n, 'foo{2}'       # {n} means "exactly n"
#     rule :n_plus, 'foo{2,}' # {n,} means "n or more"
#     rule :n_m, 'foo{2,3}'   # {n,m} means "between n and m"
#
#   Note, though you can define rules like +option+ and +options+, a rule can't add
#   a node to the parse tree if it matches nothing. These repetition suffixes are
#   more useful as part of a sequence. In practice <tt>foo?</tt> will be a less
#   efficient version of <tt>foo</tt>, and <tt>foo*</tt>, a less efficient version of
#   <tt>foo+</tt>.
#
# - <b>literals</b>
#
#     rule :foo, '"(" bar ")"'
#
#   Literals allow you to avoid defining simple leaf rules. The above is basically
#   shorthand for
#
#     rule :foo, 'left_paren bar right_paren'
#     leaf :left_paren, /\(/
#     leaf :right_paren, /\)/
#
#   You may use either single or double quotes to define literals. You may also use
#   escape sequences to include arbitrary characters in literals. Literals may have
#   repetition suffixes.
#
# - <b>grouping</b>
#
#     rule :foo, 'bar baz'
#
#   Surprise! There is no grouping syntax in Gullah. Every rule is in effect a named group.
#   So it might be better said that there are no anonymous groups in Gullah and grouping
#   doesn't involve parentheses.
#
# You may be wondering about whitespace handling. See +ignore+ and +keep_whitespace+ below.
# The short version of it is that Gullah creates an ignorable whitespace leaf rule by
# default.
#
# = Preconditions
#
# The first step in adding a node to a parse tree is collecting a sequence of child
# nodes that match some rule. If the rule is
#
#   rule :foo, 'bar+'
#
# you've collected a sequence of +bar+ nodes. If there is some condition you need this
# node to respect <em>which is dependent only on the rule and the child nodes</em>, and which you
# can't express, or not easily, in the rule itself, you can define one or more
# preconditions. E.g.,
#
#   rule :foo, 'bar+', preconditions: %i[fibonacci]
#
#   def fibonacci(_name, _start, _end, _text, children)
#     is_fibonacci_number? children.length # assumes we've defined is_fibonacci_number?
#   end
#
# A precondition is just an instance method defined in the Gullah-fied class with an arity
# of five: it takes the rule's name, a symbol, as its first argument, the start and end character
# offsets of the match as its second and third arguments, the text being parsed as its fourth argument,
# and the prospective child nodes, an array, as its last argument. If it returns a truthy value, the
# precondition holds and the node can be made. Otherwise, Gullah tries the next thing.
#
# == Preconditions versus Tests
#
# Preconditions are like tests (see below). They are further conditions on the building of
# nodes in a parse tree. Why does Gullah provide both? There are several reasons:
#
# - Preconditions are tested before the node is built, avoiding the overhead of cloning
#   nodes, so they are considerably lighter weight.
# - Because they are tested <em>before</em> the node is built, they result in no partially erroneous
#   parse in the event of failure, so they leave nothing Gullah will attempt to improve further
#   at the cost of time.
# - But they don't leave a trace, so there's nothing to examine in the event of failure.
# - And they concern only the subtree rooted at the prospective node, so they cannot express
#   structural relationships between this node and nodes which do not descend from it.
#
# <b>Note</b>: preconditions cannot test relationships between <em>nodes</em> outside the prospective node's
# subtree, but they can test its relationships to adjoining <em>characters</em>, so they can
# implement lookarounds. For instance:
#
#   def colon_after(_rule_or_leaf_name, _start_offset, end_offset, text, _children)
#     text[end_offset..-1] =~ /\A\s*:/ # equivalent to (?=\s*:)
#   end
#
# = Tests
#
#   rule :all, 'baz+', tests: %i[half_even]
#   rule :baz, 'foo | bar'
#
#   leaf :foo, /\d+/, tests: %i[xenophilia]
#   leaf :bar, /[a-z]/i, tests: %i[xenophilia]
#
#   # node test!
#
#   # half the digit characters under this node must be even, half, odd
#   def half_even(node)
#     even, odd = node.text.chars.select { |c| c =~ /\d/ }.partition { |c| c.to_i.even? }
#     even.length == odd.length ? :pass : :fail
#   end
#
#   # structure test!
#
#   # foos need bars and bars need foos
#   def xenophilia(root, node)
#     if root.name == :all
#       sought = node.name == :foo ? :bar : :foo
#       root.descendants.any? { |n| n.name == sought } ? :pass : :fail
#     end
#   end
#
# A special feature of Gullah is that you can add arbitrary tests to its rules. For example,
# you can use a simple regular expression to match a date and then a test to do a sanity
# check confirming that the parts of the date -- the year, month, and day -- combine to produce
# a real date on the calendar. This is better than simply writing a thorough regular expression
# because it gives you the opportunity to tell the user <em>how</em> a match failed rather than simply
# that it failed. This feature is Gullah's answer to such things as lookarounds and back
# references: you've matched a simple pattern; now does this pattern fit sanely with its context?
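#
# A sketch of such a date check (illustrative, not code from this gem;
# +Date.valid_date?+ is from Ruby's standard date library, and the failure
# message uses the <tt>[:fail, message]</tt> convention shown below):
#
#   require 'date'
#
#   leaf :date, /\d{4}-\d{1,2}-\d{1,2}/, tests: %i[real_date]
#
#   def real_date(n)
#     y, m, d = n.text.split('-').map(&:to_i)
#     if Date.valid_date?(y, m, d)
#       :pass
#     else
#       [:fail, "#{n.text} is not a date on the calendar"]
#     end
#   end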
#
# There are two sorts of tests: node tests and structure tests. Node tests are tests that need
# only the node itself and its subtree as inputs. Structure tests are tests that depend on
# elements of the parse tree outside of the subtree rooted at the node itself.
#
# Tests are implemented as instance methods of the Gullah-fied class. If the method has an arity
# of one, it is a node test. Its single argument is the node matched. If it has an arity of two,
# it is a structure test. The first argument is an ancestor node of the node corresponding to the
# rule. The second argument is the node itself. Because structure tests cannot be run until the
# node has some ancestor, and then they might not apply to all ancestors, they can be in a "pending"
# state, where the test is queued to run but has not yet run.
#
# Tests must return one of four values: +:pass+, +:fail+, +:ignore+, or +nil+. Only structure
# tests may return +nil+, which indicates that the preconditions for the test have not yet been
# met. If a structure test returns +nil+, the test remains in a pending state and it will be run
# again when the node acquires a new ancestor. (The +xenophilia+ example above does this
# implicitly: it returns +nil+ until the ancestor under consideration is an +all+ node.)
#
# If a node test passes, the node is accepted into the parse tree. If it fails, the node is marked
# as erroneous and the particular cause of its failure is marked in the abstract syntax tree. If
# this tree is returned to the user, they will see this information. In addition to +:fail+, the test
# may return more specific explanatory information:
#
#   leaf :word, /\w+/, tests: %i[we_want_foo!]
#
#   def we_want_foo!(n)
#     if n.text =~ /foo/
#       :pass
#     else
#       [:fail, %Q[we really wanted to see "foo" but all we got was #{n.text.inspect}]]
#     end
#   end
#
# If a test returns +:pass+, the fact that the node passed the test in question will be added to
# its +attributes+ hash in the AST.
#
# If a test returns +:ignore+, this will constitute a pass, but no edits will be made to the AST.
#
# Tests short-circuit! If a node has many tests, they run until one fails.
#
# == Disadvantages of Tests
#
# All this being said, when tests <em>fail</em> they do so after their node has been built and added
# to a parse. This means their partially broken parse remains a candidate as Gullah tries to
# find the least bad way to parse the text it was given. This can be computationally expensive.
# If you can make do with preconditions (see above), they are the better choice.
#
# = Processors
#
#   leaf :word, /[a-z]+/i, process: :abbrv
#   leaf :integer, /[1-9]\d*/, process: ->(n) { n.atts[:val] = n.text.to_i }
#
#   def abbrv(node)
#     node.attributes[:abbreviation] = node.text.gsub(/(?<!^)[aeiou]/, '')[0...5]
#   end
#
# Any rule may have a +process+ named argument whose value is either a proc or a symbol.
# If it is a symbol, it must be the name of an instance method of the Gullah-fied class.
# In either case, the arity of the code in question must be one: its single argument will
# be the node created by the rule.
#
# The processing code may do anything -- log the event, provide a breakpoint -- but its
# expected use is to calculate and store some attribute of the node or its subtree in the
# node's attribute hash, most likely to accelerate other tests that will depend on this
# value. You may use this mechanism for other purposes, of course -- to compile the text
# parsed into a more useful object, say -- but because processing may occur on nodes which
# are later discarded in failed parses, it may be more efficient to defer such handling
# of the AST until the parse completes.
#
# Processors run after any tests have completed and only if they all pass.
#
# = Motivation
#
# Why does Gullah exist? Well, mostly because it seemed like fun to make it. I have made
# other grammar-adjacent things -- a recursive descent parser in Java inspired by the grammars
# of Raku, various regular expression optimization libraries in various languages, a simple
# grammar-esque regular expression enhancer for Rust that produces abstract syntax trees but
# can't handle recursion -- so I was thinking about the topic. A problem I faced with the recursive
# descent parser, which I later learned was a well-known problem, was infinite left-recursion.
# If you have a rule such as <tt>X -> X Y | Z</tt>, where an +X+ can be made of other +X+ nodes, your recursive
# descent parser constructs an infinitely long plan that never touches the data -- "I'll try an X, which
# means I'll first try an X, which means I'll first try an X..." The solution to this is to create an
# arbitrary, perhaps adjustable, recursion limit, recognize this pattern of recursion, and bail out
# when you find you've planned too long without executing anything. This is how I solved the problem in
# the library I wrote, but I found this unsatisfactory.
#
# An alternative solution, it occurred to me, was to start with the data rather than the plan. "I have
# an +X+. What can I make with this?" This instantly solves the left recursion problem, because the application
# of a rule must consume nodes, and it seems like a more
# reasonable way to parse things generally. As a latent linguist, this appealed to me as more psychologically
# realistic. Certainly people understand words in part by approaching language with expectations -- the top-down
# pattern you see in recursive descent -- but people are constantly confronted with text begun in the middle or
# interrupted or repaired mid-sentence, so they must be able as well to take the words they hear and try to
# make something from them. So I wanted to make a data-driven, bottom-up parser.
#
# (One thing I should say up front is that the design of Gullah is based entirely on my own pondering. I am not
# a very enthusiastic reader of other people's research. I am aware that a lot of work has been done on
# parsing and parser design, but the fun for me is in coming up with the ideas more than in doing the background
# reading, so I have just dived in. I am sure I have reinvented some wheels in this, most likely badly.)
#
# (Another aside: The left-recursion problem disappears with a bottom-up parser, which must consume data to proceed, but it
# is replaced with a unary-branching problem. If you have a rule that says an +A+ can be relabeled +B+ -- that
# is, you can add a node with a single child -- you risk an infinite loop. You may define rules such that +A+ becomes
# +B+, and another rule, or series of rules, turns this +B+ back into an +A+. So this bottom-up parser has
# a somewhat unsatisfactory loop check as well.)
#
# A side benefit of bottom-up parsing is that it is robust against ill-formed data. If you can't make what you
# set out to make, at least you can make something. And the structure you build out of the data can show very
# clearly where it has gone wrong. As a linguist, this appealed to my desire to model natural languages with
# all their noise and redundancy. As a programmer, this appealed to me as a way to make data problems
# transparent and solvable.
#
# = Efficiency
#
# I have taken care to make rules fail fast and have followed a dynamic programming model in which I cache
# information which would otherwise be recalculated in many recursions, but Gullah is certainly not as
# efficient as a parser custom designed for a particular language. A SAX parser of XML, for example, can
# process its input in linear time by pushing half-processed constructs onto a stack. The general mechanism
# underlying Gullah is worst-case quadratic, because events already seen may have to be scanned again to
# see whether recent decisions have changed whether they can be handled. If every node added to a
# provisional parse tree reduces the unprocessed node count by one and every scan on average finishes
# halfway through the unhandled nodes, this would mean n(n - 1)/2 comparisons to complete the tree. I doubt,
# though I cannot prove, that one could improve on this while maintaining a parser's ability to handle
# broken data or ambiguous grammars. Ranking rules to try next based on past experience in the tree
# might improve the speed of parse discovery, but at the cost of greater complexity in the handling of any
# single scan.
#
# So if you have a particular data format or language you want to handle efficiently and you expect in most
# cases you will succeed without ambiguity on a single pass, Gullah is not the tool you want. But if you
# want to recover gracefully, it may be that a second pass with Gullah to produce the least bad parse and
# some information about how things went wrong is useful.

module Gullah
  ##
  # Define a tree structure rule. This specifies how tree nodes may be grouped under
  # another node. The required arguments are +name+ and +body+. The former is a label
  # for the node under which the others are grouped. The latter is a string defining
  # the rule.
  #
  #   rule :sequence, 'this then this'
  #
  #   rule :quantifiers, 'foo bar? baz* plugh+ qux{2} quux{3,} corge{4,5}'
  #
  #   rule :alternates, 'this | or | that'
  #   # you may also add alternates like so
  #   rule :alternates, 'also | these | and | those'
  #   rule :alternates, 'etc'
  #
  #   rule :literals, %['this' "that"]
  #
  #   rule :escapes, 'foo\\? "bar\\""'
  #
  #   # the optional named arguments:
  #
  #   rule :process, 'aha', process: ->(n) { log "Aha! we just matched #{n.text}!" }
  #   rule :or_maybe, 'oho', process: :some_arity_one_method_in_class_extending_gullah
  #
  #   rule :tests, 'test me', tests: %i[node structure]
  def rule(name, body, tests: [], preconditions: [], process: nil)
    raise Error, 'tests must be an array' unless tests.is_a? Array
    raise Error, 'preconditions must be an array' unless preconditions.is_a? Array

    init
    init_check(name)
    name = name.to_sym
    body = body.to_s.strip.gsub(/\s+/, ' ')
    return if dup_check(:rule, name, body, tests + preconditions)

    tests << [process] if process
    r = Rule.new name, body, tests: tests, preconditions: preconditions
    subrules = r.subrules || [r]
    subrules.each do |sr|
      @rules << sr
      sr.starters.each do |rule_name, n|
        (@starters[rule_name] ||= []) << n
      end
    end
    r.literals.each do |sym|
      leaf sym.to_s, Regexp.new(quotemeta(sym.to_s))
    end
  end

  ##
  # Don't make whitespace automatically ignorable.
  #
  #   class Foo
  #     extend Gullah
  #
  #     keep_whitespace
  #
  #     rule :a, 'a+'
  #     leaf :a, /a/
  #   end
  #
  #   Foo.parse "aaa aaa"
  #
  # In this example, the parse tree would consist of two +a+ nodes, each parent to three 'a' leaves,
  # separated by a "trash" node corresponding to the whitespace, for which no leaf rule was provided.
  def keep_whitespace
    @keep_whitespace = true
  end

  ##
  # A tokenization rule to divide the raw text into tokens to be matched by rules.
  #
  # The required arguments are a name and a regular expression. The name is what other
  # rules will refer to. The regular expression of course defines the character sequence
  # the rule matches. The more precise the regular expression, the fewer false possibilities
  # Gullah will have to sort through to find the best parse(s). Boundary markers in
  # particular, such as +\b+ or lookarounds like <tt>(?<!\d)</tt>, are helpful in this regard.
  #
  # The optional arguments are +tests+, +preconditions+, and +process+. See +rule+ for more regarding these.
  #
  #   leaf :word, /\b\w+\b/
  #   leaf :integer, /(?<!\d)[1-9]\d*(?!\d)/, process: ->(n) { n.atts[:val] = n.text.to_i }
  #   leaf :name, /Bob/, tests: [:not_bobbing]
  #
  #   def not_bobbing(n)
  #     /bing/.match(n.full_text, n.end) ? :fail : :pass
  #   end
  def leaf(name, rx, tests: [], preconditions: [], process: nil)
    _leaf name, rx, ignorable: false, tests: tests, process: process, preconditions: preconditions
  end

  ##
  # A tokenization rule like +leaf+, but whose tokens are invisible to other rules.
  # The +ignore+ method is otherwise identical to +leaf+.
  #
  # Unless +keep_whitespace+ is called, an +ignore+ rule covering whitespace will be
  # generated automatically. Its name will be "_ws", or, if that is taken, "_wsN", where
  # N is an integer sufficient to make this name unique among the rules of the grammar.
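  #
  # For example, to make shell-style comments ignorable as well (a sketch, not a
  # rule from this gem):
  #
  #   ignore :comment, /#[^\n]*/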
  def ignore(name, rx, tests: [], preconditions: [], process: nil)
    _leaf name, rx, ignorable: true, tests: tests, process: process, preconditions: preconditions
  end

  ##
  # A tokenization rule like +leaf+, but whose tokens cannot be the children of other nodes.
  # The +boundary+ method is otherwise identical to +leaf+.
  #
  # Boundaries are extremely valuable for reducing the complexity of parsing, because Gullah
  # knows no parse can span a boundary. Trash nodes -- nodes that correspond to character
  # sequences unmatched by any leaf rule -- are also boundaries, though most likely erroneous
  # ones.
  #
  #   # clause boundary pattern
  #   boundary :terminal, /[.!?](?=\s*\z|\s+"?\p{Lu})|[:;]/
  def boundary(name, rx, tests: [], preconditions: [], process: nil)
    _leaf name, rx, boundary: true, tests: tests, preconditions: preconditions, process: process
  end

  ##
  # Obtain the set of optimal parses of the given text. Optimality is determined
  # by four criteria. In every case the smaller the number the better.
  #
  # correctness:: The count of node or structure tests that have failed.
  # completion:: The count of root nodes.
  # pending:: The count of structure tests that were not applied.
  # size:: The total number of nodes.
  #
  # You can adjust the optimality conditions only by removing them via the optional
  # +filters+ argument. If you supply this argument, only the optimality criteria you
  # specify will be applied. The order of application is fixed: if parse A is more
  # correct than parse B, it will be kept and B discarded even if B is more complete,
  # has fewer pending tests, and fewer nodes.
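  #
  # For example, to rank candidate parses by correctness and node count alone:
  #
  #   Foo.parse text, filters: %i[correctness size]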
  #
  # The optional +n+ parameter can be used to specify the desired number of parses.
  # This is useful if your parse rules are ambiguous. For example, consider the grammar
  #
  #   class Binary
  #     extend Gullah
  #     rule :a, 'a{2}'
  #     leaf :a, /\S+/
  #   end
  #
  # If you ask this to parse the string "a b c d e f g h i j k l" it will produce
  # 58,786 equally good parses. These will consume a lot of memory, and producing them
  # will consume a lot of time. The +n+ parameter will let you get on with things faster.
  #
  # A caveat: Because of the way Gullah works, you may not get exactly +n+ parses
  # back when you ask for +n+. There may not be sufficiently many parses, of course, but
  # you may also get back more than +n+ parses if the text you are parsing contains
  # parsing boundaries. Gullah parses the portions of text inside the boundaries separately,
  # so the number of possible parses will be the product of the number of parses of
  # each bounded segment. If you have a sentence boundary in the middle of your text,
  # and thus two segments, the number of parses of the entire text will be the number
  # of parses of the first segment times the number of parses of the second. If the first
  # has two parses and the second also has two but you ask for 3, the number of parses
  # Gullah will find as it goes will be 1, then 2, then 4. There is no iteration of the
  # process in which Gullah has found exactly 3 parses. The 4 it has found are necessarily
  # all equally good, so rather than arbitrarily choosing 3 and discarding one, Gullah
  # will return all 4.
  def parse(text, filters: %i[correctness completion pending size], n: nil)
    raise Error, 'n must be positive' if n && n < 1

    commit
    segments = segment(text.to_s, filters, n)
    initial_segments = segments.select { |s| s.start.zero? }
    if n
      # iterate till all segments done or we get >= n parses
      # another place to start parallelization
      while (s = segments.reject(&:done).min_by(&:weight))
        break if s.next && initial_segments.sum(&:total_parses) >= n
      end
    else
      # iterate till all segments done
      # NOTE: could be parallelized
      while (s = segments.find { |seg| !seg.done })
        s.next
      end
    end
    if segments.length > 1
      # pass the results through a new hopper to filter out duds
      hopper = Hopper.new filters, nil
      initial_segments.flat_map(&:results).each { |p| hopper << p }
      hopper.dump.each(&:initialize_summaries)
    else
      segments.first.results
    end
  end

  ##
  # The first parse found. This takes the same arguments as +parse+ minus +n+.
  # If there are no parses without errors or unsatisfied pending tree structure
  # tests, it will be the first erroneous or incomplete parse.
  #
  # If you expect the parse to succeed and be unambiguous, this is the method you
  # want.
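  #
  #   # e.g., with the Foo grammar defined above
  #   ast = Foo.first "123 cat wow!"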
  def first(text, filters: %i[correctness completion pending size])
    parse(text, filters: filters, n: 1).first
  end

  # :stopdoc:

  private

  def init
    return if iv_check :@rules

    @rules = []
    @leaves = []
    @starters = {}
    @tests = {}
    @preconditions = {}
    @committed = false
    @do_unary_branch_check = nil
  end

  def iv_check(var)
    v = instance_variable_defined?(var) && instance_variable_get(var)
    v && block_given? ? yield(v) : v
  end

  # do some sanity checking, initialization, and optimization
  def commit
    return if iv_check(:@committed)
    raise Error, "#{name} has no leaves" unless iv_check(:@leaves, &:any?)

    # add the whitespace rule unless told otherwise
    if iv_check(:@keep_whitespace)
      remove_instance_variable :@keep_whitespace
    else
      used_rules = (@rules.map(&:name) + @leaves.map(&:name)).to_set
      base = '_ws'
      count = nil
      count = count.to_i + 1 while used_rules.include? "#{base}#{count}".to_sym
      _leaf "#{base}#{count}".to_sym, /\s+/, ignorable: true
    end

    # vet on commit so rule definition is order-independent
    [@leaves, @rules].flatten.each do |r|
      vetted_tests = r.tests.map { |t| vet t }
      vetted_preconds = r.preconditions.map { |pc| vet_precondition pc }
      r._post_init(vetted_tests, vetted_preconds)
    end
    completeness_check
    loop_check
    # arrange things so we first try rules that can complete more of the parse;
    # better would be sorting by frequency in parse trees, but we don't have
    # that information
    @starters.transform_values! { |atoms| atoms.sort_by(&:max_consumption).reverse }
    remove_instance_variable :@leaf_dup_check if iv_check(:@leaf_dup_check)
    remove_instance_variable :@rule_dup_check if iv_check(:@rule_dup_check)
    @committed = true
  end

  # has every rule/leaf required by some rule been defined?
  def completeness_check
    available = (@rules + @leaves).map(&:name).to_set
    sought = @rules.flat_map(&:seeking).uniq.to_set
    problems = sought.reject { |s| available.include? s }
    raise Error, "the following rules or leaves remain undefined: #{problems.join(', ')}" if problems.any?
  end

  # define the @do_unary_branch_check variable
  def loop_check
    @do_unary_branch_check = false
    links = @rules.select(&:potentially_unary?).flat_map(&:branches).uniq
    if links.any?
      potential_loops = links.map { |l| LoopCheck.new l }
      catch :looped do
        while potential_loops.any?
          new_potential_loops = []
          links.each do |l|
            potential_loops.each do |pl|
              if (npl = pl.add(l, self))
                new_potential_loops << npl
              end
            end
          end
          potential_loops = new_potential_loops
        end
      end
    end
  end

  class LoopCheck
    def initialize(link)
      @seen = Set.new(link)
      @seeking = link.last
    end

    def add(link, grammar)
      if @seeking == link.first
        if @seen.include? link.last
          grammar.instance_variable_set :@do_unary_branch_check, true
          throw :looped
        end
        LoopCheck.new(@seen.to_a + [link.last])
      end
    end
  end

  def init_check(name)
    raise Error, "cannot define #{name}; all rules must be defined before parsing" if @committed
  end

  # a tokenization rule to divide the raw text into tokens and separators ("ignorable" tokens)
  def _leaf(name, rx, ignorable: false, boundary: false, tests: [], preconditions: [], process: nil)
    raise Error, 'tests must be an array' unless tests.is_a? Array
    raise Error, 'preconditions must be an array' unless preconditions.is_a? Array

    init
    init_check(name)
    name = name.to_sym
    return if dup_check(:leaf, name, rx, tests + preconditions)

    tests << [process] if process
    @leaves << Leaf.new(name, rx, ignorable: ignorable, boundary: boundary, tests: tests, preconditions: preconditions)
  end

  # convert raw text into one or more arrays of leaf nodes -- maximally unreduced parses
  def lex(text)
    bases = [[0, Parse.new(text)]]
    done = []
    while bases.any?
      offset, parse = bases.shift
      added_any = false
      @leaves.each do |leaf|
        # can this leaf rule extract a leaf at this offset?
        next unless (md = leaf.rx.match(text, offset)) && md.begin(0) == offset

        e = md.end(0)
        next if leaf.preconditions.any? { |pc| pc.call(leaf.name, offset, e, text, []) == :fail }

        added_any = true
        new_parse = parse.add(offset, e, leaf, @do_unary_branch_check, false, leaf.boundary)
        if e == text.length
          done << new_parse
        else
          bases << [e, new_parse]
        end
      end
      next if added_any

      # try to eliminate trash
      trash_offset = text.length
      @leaves.each do |leaf|
        # is there a leaf like this closer to the current offset?
        next unless
          (md = leaf.rx.match(text, offset)) &&
          (b = md.begin(0)) &&
          (b < trash_offset) &&
          (e = md.end(0)) &&
          leaf.preconditions.none? { |pc| pc.call(leaf.name, b, e, text, []) == :fail }

        trash_offset = b
      end
      new_parse = parse.add(offset, trash_offset, trash_rule, false, true)
      if trash_offset == text.length
        done << new_parse
      else
        bases << [trash_offset, new_parse]
      end
    end
    done # an array of Parses
  end

  # slice text into independent segments
  def segment(text, filters, n)
    uncollected_segments = lex(text).flat_map(&:split)
    segments = uncollected_segments.group_by { |s| [s.start, s.end] }.values.map do |segs|
      Segment.new segs, filters, @starters, @do_unary_branch_check, n
    end
    segments.group_by(&:end).each do |final_offset, segs|
      continuations = segments.select { |s| s.start == final_offset }
      segs.each { |s| s.continuations = continuations }
    end
    segments
  end

  def trash_rule
    @trash_rule ||= Leaf.new(:"", nil)
  end

  def singleton
    @singleton ||= new
  end

  # check for duplicate rule/leaf
  # return true if perfect duplicate, false if novel
  def dup_check(type, name, body, tests)
    set = type == :leaf ? (@leaf_dup_check ||= ::Set.new) : (@rule_dup_check ||= ::Set.new)
    key = [name, body, tests.sort]
    if set.include? key
      true
    else
      set << key
      false
    end
  end

  # vet tests
  def vet(test)
    if test.is_a? Array
      # this is a processing function, not a real test
      return procify(test.first)
    end

    @tests[test] ||= begin
      begin
        m = singleton.method(test)
      rescue ::NameError
        raise Error, "#{test} is not defined"
      end
      raise Error, "#{test} must take either one or two arguments" unless (1..2).include? m.arity

      m
    end
  end

  # vet preconditions
  def vet_precondition(precond)
    @preconditions[precond] ||= begin
      begin
        m = singleton.method(precond)
      rescue ::NameError
        raise Error, "#{precond} is not defined"
      end
      raise Error, <<-MESSAGE.strip.gsub(/\s+/, ' ') unless m.arity == 5
        #{precond} must take five arguments:
        the rule or leaf name,
        the start character offset,
        the end character offset,
        the text being parsed,
        and the prospective children
      MESSAGE

      m
    end
  end

  # escape a string literal for use in a regex
  def quotemeta(str)
    quoted = ''
    (0...str.length).each do |i|
      c = str[i]
      quoted += '\\' if c =~ /[{}()\[\].?+*\\^$|]/
      quoted += c
    end
    quoted
  end

  def procify(processor)
    case processor
    when Symbol
      @tests[processor] ||= begin
        begin
          m = singleton.method(processor)
        rescue ::NameError
          raise Error, "#{processor} is not defined"
        end
        raise Error, "#{processor} can only take a single argument" unless m.arity == 1

        lambda { |n|
          m.call(n) unless n.error?
          return :ignore
        }
      end
    when Proc
      lambda { |n|
        processor.call(n) unless n.error?
        return :ignore
      }
    else
      raise Error, 'a node processor can only be a proc or a symbol'
    end
  end
end