gullah 0.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,233 @@
1
# frozen_string_literal: true

module Gullah
  ##
  # A parse is the collection of root nodes produced by parsing a text.
  #
  #   class Example
  #     extend Gullah
  #
  #     rule :S, 'NP VP'
  #     rule :NP, 'D N'
  #     rule :VP, 'V'
  #
  #     leaf :D, /the/
  #     leaf :N, /cat/
  #     leaf :V, /sat/
  #   end
  #
  #   parses = Example.parse 'the cat sat', n: 1
  #
  #   # this is a Parse
  #   parse = parses.first
  #   puts parse.length  # => 1
  #   puts parse.size    # => 8
  #   puts parse.summary # => S[NP[D,_ws,N],_ws,VP[V]]
  #
  class Parse
    ##
    # The root nodes of all subtrees found in this parse in sequence. This is an array.
    attr_reader :roots

    ##
    # The text parsed by this parse.
    attr_reader :text

    ##
    # A concise stringification of the syntactic structure of this parse.
    # For a given string and grammar all the parses will have a unique
    # summary.
    attr_reader :summary

    def initialize(text) # :nodoc:
      @roots = []
      @text = text
    end

    # produce a clone of this parse with a new node with the given offsets and rule;
    # returns nil instead when loop_check is true and the new node fails its loop
    # check (the +return nil+ below exits +add+ itself, not merely the block)
    def add(s, e, rule, loop_check, trash = false, boundary = false) # :nodoc:
      clone.tap do |b|
        b._roots = roots.map(&:clone)
        # trash and boundary nodes get specialized node classes
        cz = if trash
               Trash
             elsif boundary
               Boundary
             else
               Node
             end
        n = cz.new(b, s, e, rule)
        return nil if loop_check && n._loop_check?

        if n.leaf?
          # leaves are appended in sequence
          b.roots << n
        else
          # a non-leaf node replaces the span of roots it subsumes
          b.roots[s...e] = [n]
        end
      end
    end

    ##
    # The number of root nodes in this parse. This is *not* the same as size.
    def length
      roots.length
    end

    ##
    # The total number of nodes in this parse. This is *not* the same as length.
    # Memoized; the cache is dropped on clone.
    def size
      @size ||= roots.sum(&:size)
    end

    ##
    # The count of nodes that failed some test. Structure tests mark both the child
    # and the ancestor node where the test was run as erroneous,
    # so they will increase the +incorrectness_count+ by 2.
    def incorrectness_count
      @incorrectness_count ||= roots.select(&:failed?).count
    end

    ##
    # The count of nodes which have some structure test which was never
    # successfully run.
    def pending_count
      @pending_count ||= roots.select(&:pending_tests?).count
    end

    ##
    # Are there any nodes in the parse that are erroneous, either because
    # some test failed or because they correspond to "trash" -- characters
    # that matched no leaf rule?
    def errors?
      incorrectness_count.positive?
    end

    ##
    # Are all leaves accounted for without errors and have all tests passed?
    def success?
      # NOTE: && binds tighter than ||, so each root passes when it is ignorable,
      # or when it is a nonterminal with no pending tests
      !errors? && roots.all? { |n| n.ignorable? || n.nonterminal? && !n.pending_tests? }
    end

    ##
    # Not a +success?+
    def failure?
      !success?
    end

    # a simplified representation for debugging
    # "so" = "significant only"
    def dbg(so: false)
      roots.map { |n| n.dbg so: so }
    end

    ##
    # return an enumeration of all the nodes in the parse.
    #
    #   parses = Grammar.parse "this grammar uses the usual whitespace rule"
    #
    #   parses.first.nodes.select { |n| n.name == :_ws }.count # => 6
    def nodes
      NodeIterator.new self
    end

    # clones drop all memoized instance variables so they are
    # recomputed for the clone's own node set
    def clone # :nodoc:
      super.tap do |c|
        %i[@summary @size @incorrectness_count @pending_count].each do |v|
          c.remove_instance_variable v if c.instance_variable_defined?(v)
        end
      end
    end

    ##
    # The start offset of the first leaf in the parse.
    def start
      roots.first.start
    end

    ##
    # The end offset of the last leaf in the parse.
    def end
      roots.last.end
    end

    ## ADVISORILY PRIVATE

    # :stopdoc:

    # for debugging; the slice of the parsed text covered by this parse
    def own_text
      text[start...self.end]
    end

    # make a new parse whose first part is this parse's nodes and whose
    # second part is the later parse's nodes
    def merge(later)
      self.class.new(text).tap do |merged|
        merged._roots = roots + later.roots
      end
    end

    # split the parse into segments and boundaries
    def split
      last_index = 0
      splits = []

      # look for traversible sequences and boundaries
      roots.each_with_index do |n, i|
        next if n.traversible?

        if i > last_index
          # sometimes you can have two boundaries in a row,
          # or you can begin with a boundary
          segment = Parse.new text
          segment._roots = roots[last_index...i]
          splits << segment.initialize_summaries
        end

        # create boundary element
        segment = Parse.new text
        segment._roots = [n]
        splits << segment.initialize_summaries
        last_index = i + 1
      end
      # no boundary was found; the entire parse is a single segment
      return [initialize_summaries] if last_index.zero?

      # collect any trailing segment after the last boundary
      if last_index < roots.length
        segment = Parse.new text
        segment._roots = roots[last_index...roots.length]
        splits << segment.initialize_summaries
      end
      splits
    end

    def _roots=(roots)
      @roots = roots
    end

    # it would be conceptually simpler to lazily initialize the summary, but this
    # gives us a speed boost
    def initialize_summaries
      @summary = roots.each { |n| n._summary = n.name unless n.summary }.map(&:summary).join(';')
      self
    end

    def _summary=(str)
      @summary = str
    end

    # iterates over every node in every root's subtree, in sequence
    class NodeIterator # :nodoc:
      include Enumerable

      def initialize(parse)
        @parse = parse
      end

      def each(&block)
        @parse.roots.each do |root|
          root.subtree.each(&block)
        end
      end

      # the last leaf of the last root
      def last
        @parse.roots.last.leaves.last
      end
    end
  end
end
@@ -0,0 +1,56 @@
1
# frozen_string_literal: true

module Gullah
  # a Picker keeps a sorted set of iterators so we can always pick the iterator
  # most likely to lead quickly to a satisfactory parse
  class Picker # :nodoc:
    def initialize
      # ascending, duplicate-free list of the error counts currently stowed
      @error_counts = []
      # error count -> (length -> stack of iterators)
      @error_lists = []
      # error count -> ascending, duplicate-free list of lengths for that count
      @size_count_list = []
    end

    # file an iterator under its error count and its length,
    # keeping both sorted indices up to date
    def <<(iterator)
      errors = iterator.errors
      length = iterator.length
      by_length = (@error_lists[errors] ||= [])
      bucket = (by_length[length] ||= [])
      lengths = (@size_count_list[errors] ||= [])
      insert_sorted(@error_counts, errors)
      insert_sorted(lengths, length)
      # finally, stow the iterator itself
      bucket << iterator
    end

    # remove and return the best iterator -- fewest errors, and among those
    # the fewest root nodes, most recently added first; nil when empty
    def pop
      fewest_errors = @error_counts.first
      return nil unless fewest_errors

      lengths = @size_count_list[fewest_errors]
      shortest = lengths.first
      bucket = @error_lists[fewest_errors][shortest]
      chosen = bucket.pop
      # discard exhausted indices
      if bucket.empty?
        lengths.shift
        @error_counts.shift if lengths.empty?
      end
      chosen
    end

    private

    # binary-insert value into the ascending list unless already present
    def insert_sorted(list, value)
      if (i = list.bsearch_index { |c| c >= value })
        # value belongs at i; *may* already be there
        list.insert(i, value) unless list[i] == value
      else
        # value is larger than anything currently in the list
        list << value
      end
    end
  end
end
@@ -0,0 +1,90 @@
1
# frozen_string_literal: true

module Gullah
  # a non-terminal grammatical rule
  class Rule # :nodoc:
    # name -- a symbol identifying the rule
    # body -- the rule's definition string, preserved for debugging
    # tests -- tests that must be run after a match to determine whether the node is a keeper
    # ancestor_tests -- tests that must be run after an ancestor match
    # subrules/atoms -- an alternation has subrules; otherwise, a sequence of atoms
    attr_reader :name, :body, :tests, :ancestor_tests, :subrules, :atoms, :preconditions

    def initialize(name, body, tests: [], preconditions: [])
      @name = name
      @body = body
      @tests = tests
      @preconditions = preconditions
      if body =~ /\|/
        # an alternation: each branch becomes its own sub-rule with the same name
        @subrules = body.split(/ ?\| ?/).map { |branch| Rule.new(name, branch, tests: tests) }
      else
        @atoms = body.split(/ /).map { |description| Atom.new(description, self) }
        # link each atom to its successor; the final atom's successor is nil
        @atoms.each_with_index { |atom, i| atom._next = @atoms[i + 1] }
      end
    end

    # the subrules that may start a match and their atoms
    def starters
      return subrules.flat_map(&:starters) if subrules

      pairs = []
      atoms.each do |atom|
        pairs << [atom.seeking, atom]
        # nothing after the first required atom can begin a match
        break if atom.required?
      end
      pairs
    end

    # could this rule participate in a loop?
    def potentially_unary?
      return subrules.any?(&:potentially_unary?) if subrules

      atoms.sum(&:min_repeats) < 2
    end

    # collect all links from a sought symbol to the new name;
    # used in testing for potential infinite loops
    def branches
      return subrules.select(&:potentially_unary?).flat_map(&:branches) if subrules

      atoms.map { |atom| [atom.seeking, name] }
    end

    # collect all the different rules some atom of this rule might match
    def seeking
      return subrules.flat_map(&:seeking).uniq if subrules

      atoms.map(&:seeking).uniq
    end

    # obtain all the literals required by this rule
    def literals
      return subrules.flat_map(&:literals).uniq if subrules

      atoms.select(&:literal).map(&:seeking).uniq
    end

    ## ADVISORILY PRIVATE

    # partition the tests into per-node tests (arity 1) and ancestor tests
    def _post_init(tests, preconditions)
      @tests, @ancestor_tests = tests.partition { |m| m.arity == 1 }
      @preconditions = preconditions
    end
  end
end
@@ -0,0 +1,92 @@
1
# frozen_string_literal: true

module Gullah
  # a segment handles the portion of a string between boundaries
  # or a boundary itself
  class Segment # :nodoc:
    attr_reader :start, :end, :done
    attr_accessor :continuations

    def initialize(lexes, filters, starters, do_unary_branch_check, n)
      # the lexes are all parses that begin and end at the same character offsets
      # the one with the fewest root nodes is most likely to be correct
      lexes = lexes.sort_by(&:length)
      sample = lexes.first
      @start = sample.start
      @end = sample.end
      @continuations = []
      # total root-node count across all lexes; used to weight this segment
      @mass = lexes.map(&:length).sum
      @done = false
      @hopper = Hopper.new(filters, n)
      @starters = starters
      @do_unary_branch_check = do_unary_branch_check
      # the picker always hands back the most promising iterator first
      @bases = Picker.new
      lexes.each do |p|
        @bases << Iterator.new(p, @hopper, starters, do_unary_branch_check)
      end
    end

    # the number of complete parses this segment can contribute,
    # multiplied through its continuations
    def total_parses
      if @hopper.size.zero?
        0
      elsif continuations.any?
        # every parse held here pairs with every parse of every continuation
        continuations.map { |c| c.total_parses * @hopper.size }.sum
      else
        @hopper.size
      end
    end

    # used to pick the next segment to iterate
    def weight
      @mass * @hopper.size
    end

    # try to add one parse to the hopper
    # returns whether or not this succeeded
    def next
      return false if @done

      start_size = @hopper.size
      catch :done do
        while (iterator = @bases.pop)
          unless @hopper.continuable?(iterator.parse)
            # the parse cannot be reduced further; bank it as-is
            @hopper << iterator.parse
            throw :done if @hopper.satisfied?

            next
          end

          if (p = iterator.next)
            # keep both the old iterator and an iterator over the reduced parse
            @bases << iterator
            @bases << Iterator.new(p, @hopper, @starters, @do_unary_branch_check)
          elsif iterator.never_returned_any?
            # it looks like this iterator was based on an unreducible parse
            @hopper << iterator.parse
            throw :done if @hopper.satisfied?
          end
        end
      end
      end_size = @hopper.size
      if end_size == start_size
        # no parse was added this round, so none ever will be
        @done = true
        false
      else
        true
      end
    end

    # the segment's banked parses, each merged with every result of every
    # continuation; memoized
    def results
      @results ||= if continuations.any?
                     @hopper.dump.flat_map do |parse|
                       continuations.flat_map do |c|
                         c.results.flat_map do |p|
                           parse.merge(p)
                         end
                       end
                     end
                   else
                     @hopper.dump
                   end
    end
  end
end