gullah 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,233 @@
1
# frozen_string_literal: true

module Gullah
  ##
  # A parse is the collection of root nodes produced by parsing a text.
  #
  #   class Example
  #     extend Gullah
  #
  #     rule :S, 'NP VP'
  #     rule :NP, 'D N'
  #     rule :VP, 'V'
  #
  #     leaf :D, /the/
  #     leaf :N, /cat/
  #     leaf :V, /sat/
  #   end
  #
  #   parses = Example.parse 'the cat sat', n: 1
  #
  #   # this is a Parse
  #   parse = parses.first
  #   puts parse.length  # => 1
  #   puts parse.size    # => 8
  #   puts parse.summary # => S[NP[D,_ws,N],_ws,VP[V]]
  #
  class Parse
    ##
    # The root nodes of all subtrees found in this parse in sequence. This is an array.
    attr_reader :roots

    ##
    # The text parsed by this parse.
    attr_reader :text

    ##
    # A concise stringification of the syntactic structure of this parse.
    # For a given string and grammar all the parses will have a unique
    # summary.
    attr_reader :summary

    def initialize(text) # :nodoc:
      @roots = []
      @text = text
    end

    # Produce a clone of this parse holding a new node built by the given
    # rule. Returns nil when loop_check is true and the new node would
    # introduce a loop.
    #
    # s, e     -- offsets delimiting the new node (used to splice the roots
    #             array for branch nodes; leaves are simply appended)
    # trash    -- make the new node a Trash node (characters no leaf rule matched)
    # boundary -- make the new node a Boundary node
    def add(s, e, rule, loop_check, trash = false, boundary = false) # :nodoc:
      clone.tap do |b|
        b._roots = roots.map(&:clone)
        # pick the node class indicated by the flags
        cz = if trash
               Trash
             elsif boundary
               Boundary
             else
               Node
             end
        n = cz.new(b, s, e, rule)
        # abandon the addition -- the clone is discarded -- if a loop is detected
        return nil if loop_check && n._loop_check?

        if n.leaf?
          b.roots << n
        else
          # a branch node replaces the run of roots it subsumes
          b.roots[s...e] = [n]
        end
      end
    end

    ##
    # The number of root nodes in this parse. This is *not* the same as size.
    def length
      roots.length
    end

    ##
    # The total number of nodes in this parse. This is *not* the same as length.
    def size
      @size ||= roots.sum(&:size)
    end

    ##
    # The count of nodes that failed some test. Structure tests mark both the child
    # and the ancestor node where the test was run as erroneous,
    # so they will increase the +incorrectness_count+ by 2.
    def incorrectness_count
      @incorrectness_count ||= roots.select(&:failed?).count
    end

    ##
    # The count of nodes which have some structure test which was never
    # successfully run.
    def pending_count
      @pending_count ||= roots.select(&:pending_tests?).count
    end

    ##
    # Are there any nodes in the parse that are erroneous, either because
    # some test failed or because they correspond to "trash" -- characters
    # that matched no leaf rule?
    def errors?
      incorrectness_count.positive?
    end

    ##
    # Are all leaves accounted for without errors and have all tests passed?
    def success?
      # && binds tighter than ||: each root must be ignorable, or be a
      # nonterminal with no tests still pending
      !errors? && roots.all? { |n| n.ignorable? || n.nonterminal? && !n.pending_tests? }
    end

    ##
    # Not a +success?+
    def failure?
      !success?
    end

    # a simplified representation for debugging
    # "so" = "significant only"
    def dbg(so: false)
      roots.map { |n| n.dbg so: so }
    end

    ##
    # return an enumeration of all the nodes in the parse.
    #
    #   parses = Grammar.parse "this grammar uses the usual whitespace rule"
    #
    #   parses.first.nodes.select { |n| n.name == :_ws }.count # => 6
    def nodes
      NodeIterator.new self
    end

    # Clones shed all memoized values (summary, size, counts), since a
    # clone's roots are about to diverge from this parse's.
    def clone # :nodoc:
      super.tap do |c|
        %i[@summary @size @incorrectness_count @pending_count].each do |v|
          c.remove_instance_variable v if c.instance_variable_defined?(v)
        end
      end
    end

    ##
    # The start offset of the first leaf in the parse.
    def start
      roots.first.start
    end

    ##
    # The end offset of the last leaf in the parse.
    def end
      roots.last.end
    end

    ## ADVISORILY PRIVATE

    # :stopdoc:

    # for debugging: the slice of the text covered by this parse
    def own_text
      text[start...self.end]
    end

    # make a new parse whose first part is this parse's nodes and whose
    # second part is the later parse's nodes
    def merge(later)
      self.class.new(text).tap do |merged|
        merged._roots = roots + later.roots
      end
    end

    # split the parse into segments and boundaries
    def split
      last_index = 0
      splits = []

      # look for traversible sequences and boundaries
      roots.each_with_index do |n, i|
        next if n.traversible?

        if i > last_index
          # sometimes you can have two boundaries in a row,
          # or you can begin with a boundary
          segment = Parse.new text
          segment._roots = roots[last_index...i]
          splits << segment.initialize_summaries
        end

        # create boundary element
        segment = Parse.new text
        segment._roots = [n]
        splits << segment.initialize_summaries
        last_index = i + 1
      end
      # no boundary was found; the whole parse is one segment
      return [initialize_summaries] if last_index.zero?

      # collect any segment trailing the last boundary
      if last_index < roots.length
        segment = Parse.new text
        segment._roots = roots[last_index...roots.length]
        splits << segment.initialize_summaries
      end
      splits
    end

    def _roots=(roots)
      @roots = roots
    end

    # it would be conceptually simpler to lazily initialize the summary, but this
    # gives us a speed boost
    def initialize_summaries
      @summary = roots.each { |n| n._summary = n.name unless n.summary }.map(&:summary).join(';')
      self
    end

    def _summary=(str)
      @summary = str
    end

    # Enumerable view over every node of every root's subtree, in sequence.
    class NodeIterator # :nodoc:
      include Enumerable

      def initialize(parse)
        @parse = parse
      end

      def each(&block)
        @parse.roots.each do |root|
          root.subtree.each(&block)
        end
      end

      # the last leaf of the last root
      def last
        @parse.roots.last.leaves.last
      end
    end
  end
end
@@ -0,0 +1,56 @@
1
# frozen_string_literal: true

module Gullah
  # a Picker keeps a sorted set of iterators so we can always pick the iterator
  # most likely to lead quickly to a satisfactory parse
  class Picker # :nodoc:
    def initialize
      # a sorted list of the distinct error counts of the stowed iterators
      @error_counts = []
      # error count -> length -> list of iterators stowed under that pair
      @error_lists = []
      # error count -> sorted list of the distinct lengths seen for that count
      @size_count_list = []
    end

    # add an iterator, indexing it by its error count and then by its length
    def <<(iterator)
      e_idx = iterator.errors
      s_idx = iterator.length
      e_list = @error_lists[e_idx] ||= []
      list = e_list[s_idx] ||= []
      sc_list = @size_count_list[e_idx] ||= []
      if (i = @error_counts.bsearch_index { |c| c >= e_idx })
        # *may* have to add this error count
        @error_counts.insert i, e_idx if @error_counts[i] != e_idx
      else
        # this is a bigger error count than we currently have
        @error_counts << e_idx
      end
      if (i = sc_list.bsearch_index { |c| c >= s_idx })
        # *may* have to add this size
        sc_list.insert i, s_idx if sc_list[i] != s_idx
      else
        # this size is bigger than we currently have for this error count
        sc_list << s_idx
      end
      # finally, we stow the iterator
      list << iterator
    end

    # remove the best iterator: fewest errors first, then shortest length
    def pop
      error_idx = @error_counts.first
      # nothing is stowed
      return nil unless error_idx

      error_list = @error_lists[error_idx]
      size_idx = @size_count_list[error_idx].first
      size_list = error_list[size_idx]
      iterator = size_list.pop
      # remove indices if they're used up
      if size_list.empty?
        @size_count_list[error_idx].shift
        @error_counts.shift if @size_count_list[error_idx].empty?
      end
      iterator
    end
  end
end
@@ -0,0 +1,90 @@
1
# frozen_string_literal: true

module Gullah
  # a non-terminal grammatical rule
  class Rule # :nodoc:
    # name -- a symbol identifying the rule
    # body -- preserved for debugging
    # tests -- tests that must be run after a match to determine whether the node is a keeper
    # ancestor_tests -- tests that must be run after an ancestor match
    # subrules/atoms -- if you have no subrules, you have a sequence of atoms
    attr_reader :name, :body, :tests, :ancestor_tests, :subrules, :atoms, :preconditions

    def initialize(name, body, tests: [], preconditions: [])
      @name = name
      @body = body
      @tests = tests
      @preconditions = preconditions
      if body =~ /\|/
        # an alternation: each branch becomes its own subrule
        # NOTE(review): preconditions are not passed down to subrules here;
        # presumably they are installed later via _post_init -- verify
        @subrules = body.split(/ ?\| ?/).map do |subrule|
          Rule.new(name, subrule, tests: tests)
        end
      else
        # a plain sequence of atoms; link each atom to its successor
        @atoms = body.split(/ /).map do |a|
          Atom.new(a, self)
        end
        @atoms.each_with_index do |a, i|
          a._next = @atoms[i + 1]
        end
      end
    end

    # the subrules that may start a match and their atoms
    def starters
      if subrules
        subrules.flat_map(&:starters)
      else
        ar = []
        # every leading optional atom can begin a match; stop collecting at
        # the first required atom
        atoms.each do |a|
          ar << [a.seeking, a]
          break if a.required?
        end
        ar
      end
    end

    # could this rule participate in a loop?
    def potentially_unary?
      if subrules
        subrules.any?(&:potentially_unary?)
      else
        # a rule that can match just a single node might loop
        atoms.sum(&:min_repeats) < 2
      end
    end

    # collect all links from a sought symbol to the new name
    # used in testing for potential infinite loops
    def branches
      if subrules
        subrules.select(&:potentially_unary?).flat_map(&:branches)
      else
        atoms.map { |a| [a.seeking, name] }
      end
    end

    # collect all the different rules some atom of this rule might match
    def seeking
      if subrules
        subrules.flat_map(&:seeking).uniq
      else
        atoms.map(&:seeking).uniq
      end
    end

    # obtain all the literals required by this rule
    def literals
      if subrules
        subrules.flat_map(&:literals).uniq
      else
        atoms.select(&:literal).map(&:seeking).uniq
      end
    end

    ## ADVISORILY PRIVATE

    # tests of arity 1 apply to the node itself; all others are ancestor tests
    def _post_init(tests, preconditions)
      @tests, @ancestor_tests = tests.partition { |m| m.arity == 1 }
      @preconditions = preconditions
    end
  end
end
@@ -0,0 +1,92 @@
1
# frozen_string_literal: true

module Gullah
  # a segment handles the portion of a string between boundaries
  # or a boundary itself
  class Segment # :nodoc:
    attr_reader :start, :end, :done
    attr_accessor :continuations

    def initialize(lexes, filters, starters, do_unary_branch_check, n)
      # the lexes are all parses that begin and end at the same character offsets
      # the one with the fewest root nodes is most likely to be correct
      lexes = lexes.sort_by(&:length)
      sample = lexes.first
      @start = sample.start
      @end = sample.end
      @continuations = []
      @mass = lexes.map(&:length).sum
      @done = false
      @hopper = Hopper.new(filters, n)
      @starters = starters
      @do_unary_branch_check = do_unary_branch_check
      @bases = Picker.new
      # seed the picker with one iterator per lex
      lexes.each do |p|
        @bases << Iterator.new(p, @hopper, starters, do_unary_branch_check)
      end
    end

    # the number of complete parses reachable through this segment,
    # multiplied out over its continuations (if any)
    def total_parses
      if @hopper.size.zero?
        0
      elsif continuations.any?
        continuations.map { |c| c.total_parses * @hopper.size }.sum
      else
        @hopper.size
      end
    end

    # used to pick the next segment to iterate
    def weight
      @mass * @hopper.size
    end

    # try to add one parse to the hopper
    # returns whether or not this succeeded
    def next
      return false if @done

      start_size = @hopper.size
      catch :done do
        while (iterator = @bases.pop)
          unless @hopper.continuable?(iterator.parse)
            # this parse cannot be reduced further; keep it as it stands
            @hopper << iterator.parse
            throw :done if @hopper.satisfied?

            next
          end

          if (p = iterator.next)
            # the iterator may yield more parses, so put it back,
            # and queue up an iterator for the parse it just produced
            @bases << iterator
            @bases << Iterator.new(p, @hopper, @starters, @do_unary_branch_check)
          elsif iterator.never_returned_any?
            # it looks like this iterator was based on an unreducible parse
            @hopper << iterator.parse
            throw :done if @hopper.satisfied?
          end
        end
      end
      end_size = @hopper.size
      # success means the hopper grew; if it did not, no further work remains
      if end_size == start_size
        @done = true
        false
      else
        true
      end
    end

    # this segment's parses merged with those of its continuations
    def results
      @results ||= if continuations.any?
                     @hopper.dump.flat_map do |parse|
                       continuations.flat_map do |c|
                         c.results.flat_map do |p|
                           parse.merge(p)
                         end
                       end
                     end
                   else
                     @hopper.dump
                   end
    end
  end
end