gullah 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +1 -0
- data/LICENSE +21 -0
- data/README.md +87 -0
- data/Rakefile +11 -0
- data/TODO.md +2 -0
- data/examples/hat.rb +27 -0
- data/examples/trash.rb +42 -0
- data/examples/xml.rb +45 -0
- data/gullah.gemspec +31 -0
- data/lib/gullah/atom.rb +132 -0
- data/lib/gullah/boundary.rb +11 -0
- data/lib/gullah/dotifier.rb +127 -0
- data/lib/gullah/error.rb +7 -0
- data/lib/gullah/hopper.rb +142 -0
- data/lib/gullah/iterator.rb +67 -0
- data/lib/gullah/leaf.rb +24 -0
- data/lib/gullah/node.rb +553 -0
- data/lib/gullah/parse.rb +233 -0
- data/lib/gullah/picker.rb +56 -0
- data/lib/gullah/rule.rb +90 -0
- data/lib/gullah/segment.rb +92 -0
- data/lib/gullah/trash.rb +15 -0
- data/lib/gullah/version.rb +7 -0
- data/lib/gullah.rb +777 -0
- data/test/basic_test.rb +451 -0
- data/test/big_tree_test.rb +26 -0
- data/test/boundary_test.rb +29 -0
- data/test/date_test.rb +111 -0
- data/test/error_test.rb +245 -0
- data/test/json_test.rb +124 -0
- data/test/parse_demo_test.rb +33 -0
- data/test/precondition_test.rb +68 -0
- data/test/tests_per_subrule_test.rb +49 -0
- data/test/tree_walking_test.rb +88 -0
- metadata +157 -0
data/lib/gullah/parse.rb
ADDED
@@ -0,0 +1,233 @@
|
|
1
|
+
# frozen_string_literal: true

module Gullah
  ##
  # A parse is the collection of root nodes produced by parsing a text.
  #
  #   class Example
  #     extend Gullah
  #
  #     rule :S, 'NP VP'
  #     rule :NP, 'D N'
  #     rule :VP, 'V'
  #
  #     leaf :D, /the/
  #     leaf :N, /cat/
  #     leaf :V, /sat/
  #   end
  #
  #   parses = Example.parse 'the cat sat', n: 1
  #
  #   # this is a Parse
  #   parse = parses.first
  #   puts parse.length # => 1
  #   puts parse.size # => 8
  #   puts parse.summary # => S[NP[D,_ws,N],_ws,VP[V]]
  #
  class Parse
    ##
    # The root nodes of all subtrees found in this parse in sequence. This is an array.
    attr_reader :roots

    # The text parsed by this parse.
    attr_reader :text

    # A concise stringification of the syntactic structure of this parse.
    # For a given string and grammar all the parses will have a unique
    # summary.
    attr_reader :summary

    def initialize(text) # :nodoc:
      @roots = []
      @text = text
    end

    # produce a clone of this parse with a new node with the given offsets and rule
    #
    # +trash+ and +boundary+ select the node subclass (Trash, Boundary, or Node);
    # when +loop_check+ is true and the new node closes a unary loop, nil is
    # returned instead of a parse.
    # NOTE(review): for non-leaf nodes +s...e+ indexes the roots array (see the
    # splice below); for leaves they appear to be text offsets — confirm against Node.
    def add(s, e, rule, loop_check, trash = false, boundary = false) # :nodoc:
      clone.tap do |b|
        # clone the roots as well so the new parse shares no nodes with this one
        b._roots = roots.map(&:clone)
        cz = if trash
               Trash
             elsif boundary
               Boundary
             else
               Node
             end
        n = cz.new(b, s, e, rule)
        return nil if loop_check && n._loop_check?

        if n.leaf?
          b.roots << n
        else
          # replace the consumed roots with their new parent node
          b.roots[s...e] = [n]
        end
      end
    end

    ##
    # The number of root nodes in this parse. This is *not* the same as size.
    def length
      roots.length
    end

    ##
    # The total number of nodes in this parse. This is *not* the same as length.
    def size
      @size ||= roots.sum(&:size)
    end

    ##
    # The count of nodes that failed some test. Structure tests mark both the child
    # and the ancestor node where the test was run as erroneous,
    # so they will increase the +incorrectness_count+ by 2.
    def incorrectness_count
      @incorrectness_count ||= roots.select(&:failed?).count
    end

    ##
    # The count of nodes which have some structure test which was never
    # successfully run.
    def pending_count
      @pending_count ||= roots.select(&:pending_tests?).count
    end

    ##
    # Are there any nodes in the parse that are erroneous, either because
    # some test failed or because they correspond to "trash" -- characters
    # that matched no leaf rule?
    def errors?
      incorrectness_count.positive?
    end

    ##
    # Are all leaves accounted for without errors and have all tests passed?
    def success?
      !errors? && roots.all? { |n| n.ignorable? || n.nonterminal? && !n.pending_tests? }
    end

    ##
    # Not a +success?+
    def failure?
      !success?
    end

    # a simplified representation for debugging
    # "so" = "significant only"
    def dbg(so: false)
      roots.map { |n| n.dbg so: so }
    end

    ##
    # return an enumeration of all the nodes in the parse.
    #
    #   parses = Grammar.parse "this grammar uses the usual whitespace rule"
    #
    #   parses.first.nodes.select { |n| n.name == :_ws }.count # => 6
    def nodes
      NodeIterator.new self
    end

    # clones also drop any memoized derived values, since the clone is
    # expected to be mutated via _roots= before use
    def clone # :nodoc:
      super.tap do |c|
        %i[@summary @size @incorrectness_count @pending_count].each do |v|
          c.remove_instance_variable v if c.instance_variable_defined?(v)
        end
      end
    end

    ##
    # The start offset of the first leaf in the parse.
    def start
      roots.first.start
    end

    ##
    # The end offset of the last leaf in the parse.
    def end
      roots.last.end
    end

    ## ADVISORILY PRIVATE

    # :stopdoc:

    # for debugging
    def own_text
      text[start...self.end]
    end

    # make a new parse whose first part is this parse's nodes and whose
    # second part is the later parse's nodes
    def merge(later)
      self.class.new(text).tap do |merged|
        merged._roots = roots + later.roots
      end
    end

    # split the parse into segments and boundaries, returning an array of
    # parses each of which is either a run of traversible roots or a single
    # non-traversible (boundary) root; returns [self] when nothing splits
    def split
      last_index = 0
      splits = []

      # look for traversible sequences and boundaries
      roots.each_with_index do |n, i|
        next if n.traversible?

        if i > last_index
          # sometimes you can have two boundaries in a row,
          # or you can begin with a boundary
          segment = Parse.new text
          segment._roots = roots[last_index...i]
          splits << segment.initialize_summaries
        end

        # create boundary element
        segment = Parse.new text
        segment._roots = [n]
        splits << segment.initialize_summaries
        last_index = i + 1
      end
      return [initialize_summaries] if last_index.zero?

      if last_index < roots.length
        segment = Parse.new text
        segment._roots = roots[last_index...roots.length]
        splits << segment.initialize_summaries
      end
      splits
    end

    def _roots=(roots)
      @roots = roots
    end

    # it would be conceptually simpler to lazily initialize the summary, but this
    # gives us a speed boost
    def initialize_summaries
      @summary = roots.each { |n| n._summary = n.name unless n.summary }.map(&:summary).join(';')
      self
    end

    def _summary=(str)
      @summary = str
    end

    # enumerates every node in the parse by walking each root's subtree
    class NodeIterator # :nodoc:
      include Enumerable

      def initialize(parse)
        @parse = parse
      end

      def each(&block)
        @parse.roots.each do |root|
          root.subtree.each(&block)
        end
      end

      # the final leaf of the final root
      def last
        @parse.roots.last.leaves.last
      end
    end
  end
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
# frozen_string_literal: true

module Gullah
  # a Picker keeps a sorted set of iterators so we can always pick the iterator
  # most likely to lead quickly to a satisfactory parse
  class Picker # :nodoc:
    def initialize
      # error counts currently present, kept sorted ascending
      @error_counts = []
      # @error_lists[errors][size] => bucket of iterators with those stats
      @error_lists = []
      # @size_count_list[errors] => sorted ascending sizes present for that error count
      @size_count_list = []
    end

    # add an iterator, indexed by its error count and then by its length
    def <<(iterator)
      errs = iterator.errors
      size = iterator.length
      bucket = (@error_lists[errs] ||= [])[size] ||= []
      insert_sorted(@error_counts, errs)
      insert_sorted(@size_count_list[errs] ||= [], size)
      # finally, we stow the iterator
      bucket << iterator
    end

    # remove the best iterator: fewest errors, then shortest length;
    # returns nil when the picker is empty
    def pop
      best_errors = @error_counts.first
      return nil if best_errors.nil?

      sizes = @size_count_list[best_errors]
      bucket = @error_lists[best_errors][sizes.first]
      chosen = bucket.pop
      # remove indices if they're used up
      if bucket.empty?
        sizes.shift
        @error_counts.shift if sizes.empty?
      end
      chosen
    end

    private

    # insert value into the ascending-sorted array unless already present
    def insert_sorted(sorted, value)
      idx = sorted.bsearch_index { |c| c >= value }
      if idx.nil?
        # value is bigger than anything we currently have
        sorted << value
      elsif sorted[idx] != value
        sorted.insert(idx, value)
      end
    end
  end
end
|
data/lib/gullah/rule.rb
ADDED
@@ -0,0 +1,90 @@
|
|
1
|
+
# frozen_string_literal: true

module Gullah
  # a non-terminal grammatical rule
  class Rule # :nodoc:
    # name -- a symbol identifying the rule
    # body -- preserved for debugging
    # tests -- tests that must be run after a match to determine whether the node is a keeper
    # ancestor_tests -- tests that must be run after an ancestor match
    # subrules/atoms -- if you have no subrules, you have a sequence of atoms
    attr_reader :name, :body, :tests, :ancestor_tests, :subrules, :atoms, :preconditions

    def initialize(name, body, tests: [], preconditions: [])
      @name = name
      @body = body
      @tests = tests
      @preconditions = preconditions
      if body.include?('|')
        # an alternation: each branch becomes its own sub-rule under the same name
        @subrules = body.split(/ ?\| ?/).map { |alt| Rule.new(name, alt, tests: tests) }
      else
        # a plain sequence of atoms; link each atom to its successor
        # (the final atom's successor is nil)
        @atoms = body.split(/ /).map { |expr| Atom.new(expr, self) }
        @atoms.zip(@atoms.drop(1)) { |atom, successor| atom._next = successor }
      end
    end

    # the subrules that may start a match and their atoms
    def starters
      return subrules.flat_map(&:starters) if subrules

      # every optional leading atom may start a match, up to and including
      # the first required atom
      first_required = atoms.index(&:required?)
      prefix = first_required ? atoms[0..first_required] : atoms
      prefix.map { |a| [a.seeking, a] }
    end

    # could this rule participate in a loop?
    def potentially_unary?
      return subrules.any?(&:potentially_unary?) if subrules

      # a rule that can match a single node could re-derive itself
      atoms.sum(&:min_repeats) < 2
    end

    # collect all links from a sought symbol to the new name
    # used in testing for potential infinite loops
    def branches
      return subrules.select(&:potentially_unary?).flat_map(&:branches) if subrules

      atoms.map { |a| [a.seeking, name] }
    end

    # collect all the different rules some atom of this rule might match
    def seeking
      return subrules.flat_map(&:seeking).uniq if subrules

      atoms.map(&:seeking).uniq
    end

    # obtain all the literals required by this rule
    def literals
      return subrules.flat_map(&:literals).uniq if subrules

      atoms.select(&:literal).map(&:seeking).uniq
    end

    ## ADVISORILY PRIVATE

    # partition the checks by arity: one-argument checks are node tests,
    # the rest run when an ancestor matches
    def _post_init(tests, preconditions)
      @tests, @ancestor_tests = tests.partition { |t| t.arity == 1 }
      @preconditions = preconditions
    end
  end
end
|
@@ -0,0 +1,92 @@
|
|
1
|
+
# frozen_string_literal: true

module Gullah
  # a segment handles the portion of a string between boundaries
  # or a boundary itself
  class Segment # :nodoc:
    attr_reader :start, :end, :done
    attr_accessor :continuations

    # +lexes+ -- candidate lexings of this span; +filters+/+n+ configure the
    # Hopper that collects finished parses; +starters+ and
    # +do_unary_branch_check+ are forwarded to each Iterator
    def initialize(lexes, filters, starters, do_unary_branch_check, n)
      # the lexes are all parses that begin and end at the same character offsets
      # the one with the fewest root nodes is most likely to be correct
      lexes = lexes.sort_by(&:length)
      sample = lexes.first
      @start = sample.start
      @end = sample.end
      @continuations = []
      # total root count over all lexes; used in weight below
      @mass = lexes.map(&:length).sum
      @done = false
      @hopper = Hopper.new(filters, n)
      @starters = starters
      @do_unary_branch_check = do_unary_branch_check
      @bases = Picker.new
      lexes.each do |p|
        @bases << Iterator.new(p, @hopper, starters, do_unary_branch_check)
      end
    end

    # the number of complete parses reachable through this segment,
    # multiplying in the parses of any continuation segments
    def total_parses
      if @hopper.size.zero?
        0
      elsif continuations.any?
        continuations.map { |c| c.total_parses * @hopper.size }.sum
      else
        @hopper.size
      end
    end

    # used to pick the next segment to iterate
    def weight
      @mass * @hopper.size
    end

    # try to add one parse to the hopper
    # returns whether or not this succeeded
    def next
      return false if @done

      start_size = @hopper.size
      catch :done do
        while (iterator = @bases.pop)
          unless @hopper.continuable?(iterator.parse)
            # this parse cannot be reduced further; bank it as-is
            @hopper << iterator.parse
            throw :done if @hopper.satisfied?

            next
          end

          if (p = iterator.next)
            # the iterator produced a reduction: keep both the iterator
            # (it may yield more) and a fresh iterator over the new parse
            @bases << iterator
            @bases << Iterator.new(p, @hopper, @starters, @do_unary_branch_check)
          elsif iterator.never_returned_any?
            # it looks like this iterator was based on an unreducible parse
            @hopper << iterator.parse
            throw :done if @hopper.satisfied?
          end
        end
      end
      end_size = @hopper.size
      # progress is measured by growth of the hopper; no growth means done
      if end_size == start_size
        @done = true
        false
      else
        true
      end
    end

    # the parses collected for this segment, each merged with every result
    # of every continuation segment; memoized
    def results
      @results ||= if continuations.any?
                     @hopper.dump.flat_map do |parse|
                       continuations.flat_map do |c|
                         c.results.flat_map do |p|
                           parse.merge(p)
                         end
                       end
                     end
                   else
                     @hopper.dump
                   end
    end
  end
end
|