gullah 0.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +1 -0
- data/LICENSE +21 -0
- data/README.md +87 -0
- data/Rakefile +11 -0
- data/TODO.md +2 -0
- data/examples/hat.rb +27 -0
- data/examples/trash.rb +42 -0
- data/examples/xml.rb +45 -0
- data/gullah.gemspec +31 -0
- data/lib/gullah/atom.rb +132 -0
- data/lib/gullah/boundary.rb +11 -0
- data/lib/gullah/dotifier.rb +127 -0
- data/lib/gullah/error.rb +7 -0
- data/lib/gullah/hopper.rb +142 -0
- data/lib/gullah/iterator.rb +67 -0
- data/lib/gullah/leaf.rb +24 -0
- data/lib/gullah/node.rb +553 -0
- data/lib/gullah/parse.rb +233 -0
- data/lib/gullah/picker.rb +56 -0
- data/lib/gullah/rule.rb +90 -0
- data/lib/gullah/segment.rb +92 -0
- data/lib/gullah/trash.rb +15 -0
- data/lib/gullah/version.rb +7 -0
- data/lib/gullah.rb +777 -0
- data/test/basic_test.rb +451 -0
- data/test/big_tree_test.rb +26 -0
- data/test/boundary_test.rb +29 -0
- data/test/date_test.rb +111 -0
- data/test/error_test.rb +245 -0
- data/test/json_test.rb +124 -0
- data/test/parse_demo_test.rb +33 -0
- data/test/precondition_test.rb +68 -0
- data/test/tests_per_subrule_test.rb +49 -0
- data/test/tree_walking_test.rb +88 -0
- metadata +157 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 6c5345d281d3e785973c68f4c0bb05466f5563c45bce2d2e71871b0fcc5d3c4e
|
4
|
+
data.tar.gz: 7640864e5cdc6c5b788798e2364f8d72e6ff43b2eb30d57a6c7e32a549161e29
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 7e5eed3d8ea6bfcc00f5594b636fc74e98c14b1efc3af447c956a869c1137e00ea9e7f2d0804cbffd1c3923e92c58cc319c5e4b6934af7a44d558e461ebef20a
|
7
|
+
data.tar.gz: f47ca345b05f9ab747b1bd2cff6d85be1773979d98bb76e9d5d0ebcb3ada5e599d2cf3c1d8cf4006ab55fc29a6399ba600ec78949b11b5dd34d5890bee5363d1
|
data/.gitignore
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
doc/*
|
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2021 David Fairchild Houghton
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,87 @@
|
|
1
|
+
# gullah
|
2
|
+
|
3
|
+
A simple, fault-tolerant bottom-up parser written in Ruby.
|
4
|
+
|
5
|
+
# Synopsis
|
6
|
+
|
7
|
+
```ruby
|
8
|
+
class Cat
|
9
|
+
extend Gullah
|
10
|
+
|
11
|
+
rule :S, 'NP VP'
|
12
|
+
rule :NP, 'D NB'
|
13
|
+
rule :NB, 'A* N'
|
14
|
+
rule :VP, 'VP PP'
|
15
|
+
rule :VP, 'V'
|
16
|
+
rule :PP, 'P NP'
|
17
|
+
rule :P, 'prepositions'
|
18
|
+
rule :V, 'verbs'
|
19
|
+
rule :D, 'determiners'
|
20
|
+
rule :N, 'nouns'
|
21
|
+
rule :A, 'adjectives'
|
22
|
+
|
23
|
+
leaf :determiners, /\b(the|a)\b/i
|
24
|
+
leaf :nouns, /\b(cat|mat)\b/i
|
25
|
+
leaf :prepositions, /\b(on|in|around|above|beside)\b/i
|
26
|
+
leaf :verbs, /\b(sat|slept|moped)\b/
|
27
|
+
leaf :adjectives, /\b(big|small|fat|hairy|bald)\b/i
|
28
|
+
|
29
|
+
ignore :whatever, /[^\w\s]+/
|
30
|
+
end
|
31
|
+
|
32
|
+
def test_cat
|
33
|
+
parses = Cat.parse 'The fat cat sat on the mat.'
|
34
|
+
assert_equal 1, parses.length, 'there is only one parse of this sentence'
|
35
|
+
parse = parses.first
|
36
|
+
assert_equal 1, parse.nodes.reject(&:ignorable?).count, 'there is a root node for this parse'
|
37
|
+
root = parse.nodes.first
|
38
|
+
assert_equal :S, root.name, 'the root node is a sentence'
|
39
|
+
verb = root.descendants.find { |d| d.name == :VP }&.descendants&.find { |d| d.name == :V }
|
40
|
+
assert_equal 'sat', verb&.text, 'we have the expected verb'
|
41
|
+
end
|
42
|
+
```
|
43
|
+
|
44
|
+
# What is this?
|
45
|
+
|
46
|
+
A parser takes a string representing some structured data -- a sentence in a natural language, say, or a data structure, or a program in some programming language -- and a set of rules defining the possible structures in this data and it returns an object representing the structured data.
|
47
|
+
|
48
|
+
A top-down parser requires some root rule that all data structures obeying these rules will obey. A bottom-up parser says for a given piece of data what rules it may participate in. A top-down parser in effect compiles into a state machine similar to a regular expression that represents all ways a string may obey its rules. It in effect constructs a parsing plan and tries to match this plan to the string. A bottom-up parser begins planning when it sees the data. It looks at the first thing it is given to match and creates a plan for matching it and whatever may follow.
|
49
|
+
|
50
|
+
The important difference is that a top-down parser must have a single root element. A bottom-up parser takes what is given and reduces it to a set of root symbols. It need not have a common symbol that must be at the root of all parses.
|
51
|
+
|
52
|
+
# Why?
|
53
|
+
|
54
|
+
I made Gullah because it seemed like a fun project. I have written several parsing-related things
|
55
|
+
for several languages. I have written a [top-down parser](https://github.com/dfhoughton/Grammar) and a [non-recursive top-down parser](https://github.com/dfhoughton/pidgin). I thought I'd try a bottom-up parser. I have no particular use for it, but I often find I want to parse things, so maybe a use will show up.
|
56
|
+
|
57
|
+
# Should I use this?
|
58
|
+
|
59
|
+
Well, it's pretty easy to use, but if you have a bespoke parser for a particular unambiguous language, that will almost certainly be much faster. An XML parser can parse XML in linear time. Because Gullah is looking for errors and ambiguity it will consider lots of alternative dead-end permutations that a SAX parser, say, will skip. Don't write a new JSON parser in Gullah, in other words. But if you want to play with natural language, or you have some toy language or small spec you're working with, Gullah can get you going quickly. Maybe it will suffice for all your needs!
|
60
|
+
|
61
|
+
Gullah will give you its best parses of your string even if it is ungrammatical. Also, Gullah makes it easy to add arbitrary conditions on rules that another parser might not. For instance, in Gullah you can specify arbitrary-width lookarounds for a rule -- `foo` must be preceded/followed by `bar` and some number of whitespaces -- and you can define other long-distance dependences -- "runs" must have a singular subject, "viejas" must be modifying a feminine plural noun. For more on this see the documentation of node tests, ancestor tests, and preconditions in the `Gullah` module.
|
62
|
+
|
63
|
+
# Name
|
64
|
+
|
65
|
+
[Gullah](https://en.wikipedia.org/wiki/Gullah_language) is a [creole](https://en.wikipedia.org/wiki/Creole_language)
|
66
|
+
spoken on the barrier islands off the coast of the Carolinas and Georgia. I wanted to call this gem "creole" because I've
|
67
|
+
written a more impoverished parser called [pidgin](https://github.com/dfhoughton/pidgin). A
|
68
|
+
[pidgin](https://en.wikipedia.org/wiki/Pidgin) is a somewhat impoverished language created as a minimal medium
|
69
|
+
of communication between two groups without a common language. A creole is a complete language created from a pidgin
|
70
|
+
when children adopt it as their primary language. The pidgin library I wrote is minimal in that it cannot handle
|
71
|
+
recursive structures. I wanted to create a better parser that could handle all the complexity of natural (or artificial)
|
72
|
+
languages. Since this was an evolution from pidgin, I wanted to call it creole.
|
73
|
+
|
74
|
+
Well, "creole" was taken. So I chose among the names of the creoles I knew of. Gullah is a creole of English and various
|
75
|
+
Central and West African languages. I thought the name "Gullah" was cool and I like the way Gullah sounds, so I picked "Gullah".
|
76
|
+
|
77
|
+
I hope this causes no offense to speakers of Gullah.
|
78
|
+
|
79
|
+
# Future
|
80
|
+
|
81
|
+
Because Gullah is designed to handle ambiguous grammars and erroneous data, it can produce many parses for a given string. Right now you can ask for all parses (maybe very slow), or at least `n` equally good parses. This makes the API a little complicated and noisy. I have begun work rewriting it to return a lazy enumeration of parses. Ruby's `Enumerable` magic is nice. I haven't finished this, though, and I might not, but it may be that some future version of this will change the API dramatically.
|
82
|
+
|
83
|
+
Another possibility, if I have sufficient spare time and am sufficiently ambitious, is that I may refactor the algorithm to use ractors. Certain parts of the algorithm are a natural fit for parallelization. We shall see.
|
84
|
+
|
85
|
+
# Acknowledgements
|
86
|
+
|
87
|
+
I would like to thank my family and co-workers for tolerating me saying "Gullah" much more often than any of them expected.
|
data/Rakefile
ADDED
data/TODO.md
ADDED
data/examples/hat.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'gullah'
|
4
|
+
|
5
|
+
# :stopdoc:
|
6
|
+
|
7
|
+
# A toy grammar for English noun phrases with possessives and "of"
# phrases, e.g. "the queen of England's hat".
class Englishish
  extend Gullah

  # phrase-structure rules
  rule :NP, 'NP PP'
  rule :NP, 'D N'
  rule :NP, 'Proper'
  rule :D, 'the | Possessive'
  rule :PP, 'prep NP'
  rule :Possessive, 'NP pe'

  # leaves: the regular expressions that tokenize the raw text
  leaf :the, /\bthe\b/i
  leaf :pe, /(?<=[a-rt-z])'s|(?<=s)'/i # possessive ending: 's, or a bare ' after an s
  leaf :Proper, /\bE(?i)ngland\b/
  leaf :N, /\b(?:queen|hat)\b/i
  leaf :prep, /\bof\b/i
end
|
23
|
+
|
24
|
+
# Parse an ambiguous phrase, print a summary of every parse, and render
# each parse tree with graphviz as hat0, hat1, ...
parses = Englishish.parse("the queen of England's hat")
parses.each.with_index do |parse, index|
  puts parse.summary
  Gullah::Dotifier.dot(parse, "hat#{index}", make_it: :so)
end
|
data/examples/trash.rb
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# :stopdoc:
|
4
|
+
|
5
|
+
require 'gullah'
|
6
|
+
|
7
|
+
# Some random rules to experiment with, chosen so that it is easy to
# produce trash -- !@#$%@#$ is trash, for instance.
class Sanitation
  extend Gullah

  # structural rules built from word-length classes
  rule :ping, 'foo bar | bar foo'
  rule :pang, 'bar baz+ | plugh'
  rule :pong, 'foo{2,} | bar{2,} | baz{2,} | plugh{2,}'
  rule :peng, 'ping | pang | pong'
  rule :pung, 'peng+'

  # leaves classify words purely by their length
  leaf :foo, /\b\w\b/       # one-character words
  leaf :bar, /\b\w{2}\b/    # two-character words
  leaf :baz, /\b\w{3}\b/    # three-character words
  leaf :plugh, /\b\w{4,}\b/ # words of four or more characters

  # sentence-ending punctuation is declared as a boundary
  boundary :stop, /[.!?;:]/
end
|
25
|
+
|
26
|
+
# sample text to run through the Sanitation grammar
text = <<-PROFUNDITY
A Riddle (somewhat German)

The beginning of Eternity.
The end of Time and Space.
The beginning of every End.
The end of every Place.

There once was a girl who had a little curl
Right in the middle of her forehead.
And when she was good, she was very, very good
But when she was bad she was horrid!
PROFUNDITY

# take the first parse, summarize it, and render it as a graph named "poem"
poetry = Sanitation.first(text)
puts poetry.summary
Gullah::Dotifier.dot(poetry, 'poem', make_it: :so)
|
data/examples/xml.rb
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'gullah'
|
4
|
+
|
5
|
+
# :stopdoc:
|
6
|
+
|
7
|
+
# A toy grammar for a small XML-like language.
class XMLish
  extend Gullah

  rule :root, 'element'
  rule :element, 'full | empty'
  rule :full, '"<" tag attribute* ">" content* "</" tag ">"', preconditions: %i[same_tag]
  rule :empty, '"<" tag attribute* "/>"'
  rule :content, 'element | text | entity', tests: %i[has_parent]
  rule :attribute, 'tag "=" value'
  rule :value, 'squote | dquote'

  leaf :tag, /\b[a-z]+\b/
  leaf :text, /[^&<>]+/
  leaf :entity, /&(?:[lg]t|amp|[lr]dquo);/
  leaf :squote, /'[^']*'/
  leaf :dquote, /"[^"]*"/

  # Precondition for :full -- the first two direct children named :tag
  # (the opening and closing tag names) must have identical text.
  def same_tag(_name, _s, _e, _text, children)
    opening, closing = children.select { |child| child.name == :tag }
    opening.text == closing.text
  end

  # Node test attached to :content; as written it always reports :pass.
  def has_parent(_root, _node)
    :pass
  end
end
|
33
|
+
|
34
|
+
# Sample documents; for each, pick the parse that is smallest by
# [length, size], print its summary, and render it as xml0, xml1, ...
samples = [
  '<root/>',
  '<root>some text “thing” and more text</root>',
  '<foo>I have</foo><bar>no root</bar>',
  '<foo attibutes="!"><and><nested/></and></foo>'
  # this next one is r-e-a-l-l-y s-l-o-o-o-o-o-w
  # '<big i="have" some="attributes"><empty/>text<element also="attributes">with<things/>inside</element></big>'
]

samples.each_with_index do |xml, i|
  best = XMLish.parse(xml).min_by { |candidate| [candidate.length, candidate.size] }
  puts best.summary
  Gullah::Dotifier.dot(best, "xml#{i}", make_it: :so)
end
|
data/gullah.gemspec
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
lib = File.expand_path('lib', __dir__)
|
4
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
5
|
+
require 'gullah/version'
|
6
|
+
|
7
|
+
# Gem metadata for gullah. The version comes from Gullah::VERSION and the
# packaged file list from whatever git currently tracks.
Gem::Specification.new do |s|
  s.name        = 'gullah'
  s.version     = Gullah::VERSION
  s.summary     = 'A bottom up parser generator'
  # the heredoc is collapsed onto a single line of space-separated words
  s.description = <<-DESC.strip.gsub(/\s+/, ' ')
    Gullah is a bottom-up parser generator that can
    handle errors, ambiguous syntax, and arbitrary matching
    conditions.
  DESC
  s.authors     = ['David F. Houghton']
  s.email       = 'dfhoughton@gmail.com'
  s.homepage    =
    'https://rubygems.org/gems/gullah'
  s.license     = 'MIT'
  s.required_ruby_version = '>= 2.6'
  s.files         = `git ls-files -z`.split("\x0")
  s.test_files    = s.files.grep(%r{^(test|spec|features)/})
  s.require_paths = ['lib']

  s.add_development_dependency 'bundler', '~> 1.7'
  s.add_development_dependency 'byebug', '~> 9.1.0'
  s.add_development_dependency 'json', '~> 2'
  s.add_development_dependency 'minitest', '~> 5'
  s.add_development_dependency 'rake', '~> 10.0'
end
|
data/lib/gullah/atom.rb
ADDED
@@ -0,0 +1,132 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Gullah
  # a minimal rule fragment; this is where the actual matching occurs
  #
  # An atom is built from one piece of a rule body: either an identifier or
  # a quoted literal, optionally followed by a repetition suffix
  # (?, *, +, {n}, {n,} or {n,m}). Atoms are chained through #next; a match
  # of one atom hands the remaining nodes to the next atom in the chain.
  class Atom # :nodoc:
    attr_reader :seeking, :min_repeats, :max_repeats, :parent, :next, :literal

    # The shape of a single atom: a name or quoted literal plus an optional
    # repetition suffix.
    ATOM_PATTERN = /\A
      (
        (?:[a-zA-Z_]|\\.)(?:\w|\\.)*   # decent identifier, maybe with escaped bits
        |
        "(?:[^"\\]|\\.)+"              # double-quoted string, maybe with escaped characters
        |
        '(?:[^'\\]|\\.)+'              # single-quoted string, maybe with escaped characters
      )
      ([?*+]|\{\d+(?:,\d*)?\})?        # optional repetition suffix
    \z/x.freeze

    # +atom+ is the text of the rule fragment and +parent+ is stored as-is.
    # Raises Error if the fragment is malformed or its repetition range is
    # inverted (e.g. "foo{3,2}").
    def initialize(atom, parent)
      @parent = parent
      rule, suffix = ATOM_PATTERN.match(atom)&.captures
      raise Error, "cannot parse #{atom}" unless rule

      # an index (0) when the fragment starts with a quote, nil otherwise
      @literal = rule[0] =~ /['"]/
      @seeking = clean(rule).to_sym
      @min_repeats, @max_repeats = repetition_bounds(atom, suffix)
    end

    # whether this atom must match at least once
    def required?
      min_repeats.positive?
    end

    # returns the new offset, or nil if the atom doesn't match
    #
    # Starting at +offset+, counts consecutive non-ignorable nodes that are
    # traversible and named +seeking+; ignorable nodes are skipped without
    # being counted. On success the rest of the chain is matched from the
    # first node this atom did not consume.
    def match(nodes, offset)
      return (min_repeats.zero? ? offset : nil) if offset >= nodes.length

      count = 0
      (offset...nodes.length).each do |i|
        node = nodes[i]
        next if node.ignorable?

        # Only reachable for a zero-repeat atom ({0}): it must consume
        # nothing, so the chain continues from this very node.
        return returnable(nodes, i) if count == max_repeats

        unless node.traversible? && node.name == seeking
          return count >= min_repeats ? returnable(nodes, i) : nil
        end

        count += 1
        return returnable(nodes, i + 1) if count == max_repeats
      end
      count < min_repeats ? nil : returnable(nodes, nodes.length) # all nodes were consumed
    end

    # used to order rules so greedier ones go first
    #
    # The sum of the maximum repeats of this atom and all that follow it,
    # with an unbounded atom counted as 10.
    def max_consumption
      @max_consumption ||= begin
        augment = max_repeats == Float::INFINITY ? 10 : max_repeats
        rest = self.next ? self.next.max_consumption : 0
        rest + augment
      end
    end

    ## ADVISORILY PRIVATE

    # sets the atom that follows this one in the chain
    def _next=(nxt)
      @next = nxt
    end

    private

    # Translates a repetition suffix into a [min, max] pair; no suffix
    # means exactly once. Raises Error when an explicit range is inverted.
    def repetition_bounds(atom, suffix)
      case suffix
      when nil then [1, 1]
      when '?' then [0, 1]
      when '+' then [1, Float::INFINITY]
      when '*' then [0, Float::INFINITY]
      else
        min, comma, max = /(\d+)(?:(,)(\d+)?)?/.match(suffix).captures
        min = min.to_i
        return [min, min] unless comma               # {n}
        return [min, Float::INFINITY] unless max     # {n,}

        max = max.to_i
        raise Error, "cannot parse #{atom}: #{min} is greater than #{max}" if max < min

        [min, max]                                   # {n,m}
      end
    end

    # continue with the next atom in the chain, or report the offset reached
    def returnable(nodes, offset)
      if self.next
        self.next.match(nodes, offset)
      else
        offset
      end
    end

    # remove quotes and escapes
    def clean(str)
      str = str[1...-1] if literal
      # drop each escaping backslash, keeping the character it protects
      str.gsub(/\\(.)/) { Regexp.last_match(1) }
    end
  end
end
|
@@ -0,0 +1,127 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Gullah
  # A little tool to help visualize a parse tree. It generates .dot files
  # parsable by graphviz. If you have graphviz installed, you may be able
  # to invoke it like so and generate a .png file
  #
  #   Gullah::Dotifier.dot parses.first, "tree", make_it: :so
  #
  # This will generate a file called tree.png showing the parse tree. If you
  # don't have graphviz, or perhaps if you're on a machine which doesn't like
  # the command this generates -- I suspect Windows doesn't -- you can skip
  # the named argument and just generate the dot file which you can feed into
  # graphviz some other way.
  #
  # I make no guarantees about this utility. You may want to build your own,
  # in which case this can serve as a simple prototype.
  class Dotifier
    ##
    # Receives a parse and a file name and generates a graph specification
    # readable by graphviz. The specification is written to a file with the
    # specified file name. If +make_it+ is truthy, the +dot+ command will
    # also be invoked and the graph image generated. By default this will be
    # a png, though the type is specifiable via the +type+ named argument.
    #
    # Returns nil when +make_it+ is falsy; otherwise returns the result of
    # running +dot+ (true on success, false or nil on failure).
    def self.dot(parse, file, make_it: false, type: 'png')
      new.send :dot, parse, file, make_it, type
    end

    # making the guts private to simplify the API

    private

    # Writes the graph specification for +parse+ to +file+ and optionally
    # renders it with graphviz.
    def dot(parse, file, make_it, type)
      @edges = {}
      File.open file, 'w' do |f|
        f.puts 'graph {'
        f.puts "\tnode[shape=none]"
        f.puts
        parse.roots.each do |root|
          tree(root, f)
        end
        # put all the leaves in a row at the bottom
        f.puts
        f.puts "\tsubgraph {"
        f.puts "\t\trank=\"same\""
        parse.roots.flat_map(&:leaves).reject(&:ignorable?).each do |leaf|
          f.puts "\t\t#{leaf_name(leaf)}"
        end
        f.puts "\t}"
        f.puts '}'
      end
      return unless make_it

      # Pass arguments as a list so no shell is involved: file names with
      # spaces or shell metacharacters are handed to dot untouched.
      rendered = system('dot', "-T#{type}", "-o#{file}.#{type}", file.to_s)
      warn "Gullah::Dotifier: graphviz could not render #{file}.#{type}" unless rendered
      rendered
    end

    # Emits +node+, its text (if it is a leaf), and its edges to each
    # non-ignorable child, recursing into the children.
    def tree(node, f)
      return if node.ignorable?

      nn = name(node)
      f.puts "\t#{nn} #{node_attributes(node)}"
      if node.leaf?
        ln = leaf_name(node)
        f.puts "\t#{ln} [label=#{node.text.inspect}]"
        f.puts "\t#{nn} -- #{ln}"
      end
      # Edge colors must be recorded before the child edges below are
      # written, since edge_attributes reads what add_edge stored.
      Array(node.atts[:satisfied_ancestor]).each do |_, loc, *|
        child = node.find loc
        add_edge node, child, :success, true
      end
      Array(node.atts[:failed_ancestor]).each do |_, loc, *|
        child = node.find loc
        add_edge node, child, :error, true
      end
      Array(node.children&.reject(&:ignorable?)).each do |child|
        f.puts "\t#{nn} -- #{name(child)}#{edge_attributes node, child}"
        tree(child, f)
      end
    end

    # Marks every edge on the path from +parent+ down to +child+ with
    # +property+. Stops quietly if the descendant cannot be located.
    def add_edge(parent, child, property, value)
      return if child.nil?

      while parent != child
        middle = parent.children&.find { |c| c.contains? child.start }
        break unless middle

        (@edges[[parent.position, middle.position]] ||= {})[property] = value
        parent = middle
      end
    end

    # The attribute list for the edge from +node+ to +child+, or nil if the
    # edge has no recorded properties; errors take precedence over successes.
    def edge_attributes(node, child)
      atts = []
      if (properties = @edges[[node.position, child.position]])
        if properties[:error]
          atts << 'color=red'
        elsif properties[:success]
          atts << 'color=green'
        end
      end
      " [#{atts.join(';')}]" if atts.any?
    end

    # The attribute list for +node+: its label plus a color and shape
    # reflecting whether it is trash, a boundary, an error, or satisfied
    # an ancestor or descendant test.
    def node_attributes(node)
      atts = ["label=#{node.trash? ? 'trash' : node.name.to_s.inspect}"]
      if node.trash?
        atts << 'color=red'
        atts << 'shape=box'
      elsif node.boundary?
        atts << 'color=blue'
        atts << 'shape=box'
      elsif node.error?
        atts << 'color=red'
        atts << 'shape=oval'
      elsif node.atts[:satisfied_ancestor] || node.atts[:satisfied_descendant]
        atts << 'color=green'
        atts << 'shape=oval'
      end
      "[#{atts.join(';')}]"
    end

    # graph identifier for the node itself, derived from its position
    def name(node)
      offset, height = node.position
      "n_#{offset}_#{height}"
    end

    # graph identifier for the text hanging beneath a leaf node
    def leaf_name(node)
      offset, height = node.position
      "l_#{offset}_#{height}"
    end
  end
end
|