grammaphone 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/grammaphone.rb +192 -0
- data/lib/grammaphone/errors.rb +29 -0
- data/lib/grammaphone/rule.rb +60 -0
- data/lib/grammaphone/tokens.rb +141 -0
- metadata +46 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: a6fc51b323812f7278f9e51540cccbc1df3530c17da6abc75f9fb7c97065433d
|
4
|
+
data.tar.gz: 3e17ab606b44b15d37445d81a73102fb2122569ecc43839c40c25498f6ebd340
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: b06c48105fd3b8b3682fe66fa1ec747d779100174724dd2b17de1c316e9b6dc81ff64aedebda0c9c97fa5654c7a7bf9767d4d07d9f15bd59f7d13e354d6f977f
|
7
|
+
data.tar.gz: 492dce702c1fa07ce1258c5ff62c32bfc8c8669ec224def19fba81eb8fc7ee69b83f7ca6d8b8518ab7b4d5e491538efbe32c224cea635f80f8d3008a0ad16aee
|
data/lib/grammaphone.rb
ADDED
@@ -0,0 +1,192 @@
|
|
1
|
+
# Grammaphone is a dynamically-definable parser pseudo-generator based on a
|
2
|
+
# BNF-like grammar.
|
3
|
+
#
|
4
|
+
# ### Grammar
|
5
|
+
# A grammar is defined using key-value pairs in a hash. This is viewed as a
|
6
|
+
# set of names, each with an associated set of patterns it can match against. A name can be
|
7
|
+
# any sequence of characters that are valid in a Ruby String (i.e. any Unicode
|
8
|
+
# character). Similarly, the patterns can be composed of any Ruby-valid characters.
|
9
|
+
#
|
10
|
+
# ### Writing a Rule
|
11
|
+
# A rule is a list of element identifiers, separated by spaces (`" "` or `\x20`).
|
12
|
+
#
|
13
|
+
# Element identifiers fall into three categories: literals, patterns, and rules.
|
14
|
+
#
|
15
|
+
# A literal is a string of characters that will be matched iff the token matches
|
16
|
+
# the literal text *exactly*. Literal sequences are preceded by one double-quote
|
17
|
+
# (`"` or `\x22`). For example, to match the exact string "hello", the literal
|
18
|
+
# string you would use is `"hello`. Note that the initial double-quote is not
|
19
|
+
# included in the literal itself.
|
20
|
+
#
|
21
|
+
# A pattern is a string of characters representing a Ruby-valid regex. Pattern
|
22
|
+
# sequences are surrounded by one forward slash (`"/"` or `\x2F`) on each end.
|
23
|
+
# For example, to match a capitalized name composed strictly of ASCII letters,
|
24
|
+
# (e.g. "April", "John", "Alex"), the pattern string could be `/[A-Z][a-z]*/`.
|
25
|
+
# Note that every pattern must strictly match the whole token. Anything else
|
26
|
+
# won't be matched.
|
27
|
+
#
|
28
|
+
# A rule identifier is a string of characters representing the name of a grammatical
|
29
|
+
# rule. Rule identifier sequences are trivial, and are specified by using the
|
30
|
+
# exact name of the rule with no decorations. For example, to reference a rule
|
31
|
+
# named "NUMBER", the Rule string is `NUMBER`. Note that, like any grammar, a rule
|
32
|
+
# can refer to itself to define a recursive pattern. If a rule name is specified
|
33
|
+
# that doesn't exist in the current grammar, an exception will be raised and
|
34
|
+
# parsing will immediately stop.
|
35
|
+
#
|
36
|
+
# A rule is composed of zero or more element identifiers, which are evaluated in
|
37
|
+
# order. If a rule has no identifiers, then it will only match an empty token
|
38
|
+
# list, which will always succeed.
|
39
|
+
#
|
40
|
+
# Multiple options for a rule can be specified by passing an Array where each
|
41
|
+
# element of the Array is a valid rule. These rules are treated as possibilities
|
42
|
+
# for matching, with a precedence specified by the order.
|
43
|
+
#
|
44
|
+
# #### Example
|
45
|
+
# The two most common introductory programs are "Hello, world!", and an
|
46
|
+
# introduction program, given a name. For the purposes of this example, the latter
|
47
|
+
# prints in the format "Hello, \<name\>!", where `<name>` is the name entered.
|
48
|
+
#
|
49
|
+
# The following Hash describes the grammar that matches the output of these programs,
|
50
|
+
# assuming they are tokenized as ["Hello", ",", " ", \<name\>/world, "!"]. That
|
51
|
+
# tokenization is not default, but is assumed for the purposes of this example.
|
52
|
+
# This is by no means the only possible grammar, just an example.
|
53
|
+
#
|
54
|
+
# ```ruby
|
55
|
+
# {
|
56
|
+
# START: '"Hello ", /\s/ NAME "!',
|
57
|
+
# NAME: ['"world', '/[A-Z][a-z]*/']
|
58
|
+
# }
|
59
|
+
# ```
|
60
|
+
#
|
61
|
+
# Note that to match a space, you need to use the pattern, since the splitting function
|
62
|
+
# for rules splits on the space character, regardless of where it is.
|
63
|
+
|
64
|
+
require_relative "grammaphone/errors"
|
65
|
+
require_relative "grammaphone/tokens"
|
66
|
+
require_relative "grammaphone/rule"
|
67
|
+
|
68
|
+
class Grammaphone
|
69
|
+
|
70
|
+
def self.tokenize(src, &split_method)
|
71
|
+
TokenStream.new(src, &split_method)
|
72
|
+
end
|
73
|
+
|
74
|
+
# node_type must accept a
|
75
|
+
def initialize(rules = {}, node_type = Array, &default_action)
|
76
|
+
raise ArgumentError.new("cannot form parser from a #{rules.class}") unless rules.kind_of? Hash
|
77
|
+
raise ArgumentError.new("syntax tree type must respond to <<") unless node_type.method_defined?(:"<<")
|
78
|
+
@default_action = (default_action.nil? ? lambda{|node, name| node} : default_action)
|
79
|
+
@node_type = node_type
|
80
|
+
@rules = rules.map do |k, v|
|
81
|
+
Rule.new(k, v, @default_action)
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
85
|
+
def add_rule(name, rule, &action)
|
86
|
+
m = @rules.find {|r| r.name == name}
|
87
|
+
action = @default_action if action.nil?
|
88
|
+
if m.nil?
|
89
|
+
@rules << Rule.new(name, rule, action)
|
90
|
+
else
|
91
|
+
m.rule = rule
|
92
|
+
m.action = action
|
93
|
+
end
|
94
|
+
end
|
95
|
+
|
96
|
+
def rules
|
97
|
+
@rules.map{|r| [r.name, r.rule]}.to_h
|
98
|
+
end
|
99
|
+
|
100
|
+
def parse(token_stream)
|
101
|
+
token_stream = TokenStream.new(token_stream) unless token_stream.kind_of?(TokenStream)
|
102
|
+
raise EmptyRulesetError if @rules.size == 0
|
103
|
+
res = self.send(@rules[0].name, token_stream, @node_type)
|
104
|
+
res
|
105
|
+
end
|
106
|
+
|
107
|
+
# Not to be released in shipped version
|
108
|
+
def test(name, token_stream)
|
109
|
+
self.send(name, TokenStream.new(token_stream))
|
110
|
+
end
|
111
|
+
|
112
|
+
def respond_to_missing?(m, include_all)
|
113
|
+
(include_all && @rules.any?{|r| r.name == m}) || super
|
114
|
+
end
|
115
|
+
|
116
|
+
# This is fun, but it doesn't really take advantage of metaprogramming in a way
|
117
|
+
# that can't be accomplished with match_rule. It also lets the rules be "called"
|
118
|
+
# outside of normal context
|
119
|
+
def method_missing(m, *args, &block)
|
120
|
+
r = @rules.find{|r| r.name == m}
|
121
|
+
if r
|
122
|
+
match_rule(r, args[0], args[1])
|
123
|
+
else
|
124
|
+
super
|
125
|
+
end
|
126
|
+
end
|
127
|
+
|
128
|
+
private
|
129
|
+
|
130
|
+
def match_rule(r, stream, result_type)
|
131
|
+
# This is an enormous function. It needs to be pared down
|
132
|
+
matches = nil
|
133
|
+
result = result_type.new
|
134
|
+
r.each do |option|
|
135
|
+
tokens = stream.dup
|
136
|
+
break if option.empty?
|
137
|
+
matched = true
|
138
|
+
|
139
|
+
option.each do |element|
|
140
|
+
token = tokens.peek
|
141
|
+
# puts "rule: #{r.name}; element: #{element}; token: #{token}"
|
142
|
+
if Token.literal?(element)
|
143
|
+
unless Token.matches_literal?(element, token)
|
144
|
+
matches = nil
|
145
|
+
matched = false
|
146
|
+
break
|
147
|
+
end
|
148
|
+
|
149
|
+
matches ||= []
|
150
|
+
matches << token
|
151
|
+
result << token
|
152
|
+
tokens.next # might as well be tokens.skip
|
153
|
+
elsif Token.pattern?(element)
|
154
|
+
unless Token.matches_pattern?(element, token)
|
155
|
+
matches = nil
|
156
|
+
matched = false
|
157
|
+
break
|
158
|
+
end
|
159
|
+
|
160
|
+
matches ||= []
|
161
|
+
unless token.nil?
|
162
|
+
matches << token
|
163
|
+
result << token
|
164
|
+
end
|
165
|
+
tokens.next
|
166
|
+
else
|
167
|
+
raise TokenError.new("Can't have empty patterns") if element.empty?
|
168
|
+
|
169
|
+
submatches, res = self.send(element, tokens, result_type)
|
170
|
+
unless submatches
|
171
|
+
matches = nil
|
172
|
+
matched = false
|
173
|
+
break
|
174
|
+
end
|
175
|
+
|
176
|
+
matches ||= []
|
177
|
+
matches << submatches
|
178
|
+
result << res
|
179
|
+
tokens.skip([submatches.size, 1].max)
|
180
|
+
end
|
181
|
+
end
|
182
|
+
|
183
|
+
if matched
|
184
|
+
result = r.trigger(result)
|
185
|
+
break
|
186
|
+
end
|
187
|
+
end
|
188
|
+
# puts "matches for rule #{r.name}: #{matches.to_s}" unless matches.nil?
|
189
|
+
return false if matches.nil?
|
190
|
+
[matches, result]
|
191
|
+
end
|
192
|
+
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
class Grammaphone
|
2
|
+
class ParseError < StandardError; end
|
3
|
+
|
4
|
+
class RulesetError < ParseError
|
5
|
+
def message
|
6
|
+
super + "Problem with ruleset definition"
|
7
|
+
end
|
8
|
+
end
|
9
|
+
|
10
|
+
class EmptyRulesetError < RulesetError
|
11
|
+
def message
|
12
|
+
super + ": empty ruleset not allowed"
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
class TokenError < ParseError; end
|
17
|
+
|
18
|
+
class NonstringTokenError < TokenError
|
19
|
+
def message
|
20
|
+
super + "Token not a String"
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
class TokenStreamError < TokenError
|
25
|
+
def message
|
26
|
+
super + "Non-Array-able types can't be tokenized"
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
require_relative "./errors"
|
2
|
+
class Grammaphone
|
3
|
+
private
|
4
|
+
|
5
|
+
class Rule
|
6
|
+
attr_reader :name
|
7
|
+
|
8
|
+
def initialize(name, rule, act = nil)
|
9
|
+
raise ArgumentError.new("rule names must be a String or Symbol") unless (name.kind_of?(Symbol) || name.kind_of?(String))
|
10
|
+
@name = name.to_sym
|
11
|
+
self.rule = rule
|
12
|
+
self.action = act
|
13
|
+
end
|
14
|
+
|
15
|
+
def rule
|
16
|
+
@rule.dup
|
17
|
+
end
|
18
|
+
|
19
|
+
def rule=(rule)
|
20
|
+
case rule
|
21
|
+
when Array
|
22
|
+
raise ArgumentError.new("grammar rule as an Array must contain only Strings") unless rule.all?{|r| r.kind_of?(String)}
|
23
|
+
@rule = rule.dup
|
24
|
+
when String
|
25
|
+
@rule = [rule.dup]
|
26
|
+
else
|
27
|
+
raise ArgumentError.new("grammar rule must be a String or Array of Strings")
|
28
|
+
end
|
29
|
+
@allows_empty = @rule.any?{|r| r.empty?}
|
30
|
+
end
|
31
|
+
|
32
|
+
# action expected to return an Array-like object with flatten implemented
|
33
|
+
def action=(action)
|
34
|
+
raise ArgumentError.new("rule actions must be a proc") unless (action.kind_of?(Proc) || action.kind_of?(NilClass))
|
35
|
+
if action.nil?
|
36
|
+
@action = lambda {|tokens, name| token}
|
37
|
+
else
|
38
|
+
@action = action
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
def each
|
43
|
+
if block_given?
|
44
|
+
@rule.each do |r|
|
45
|
+
yield r.split(" ")
|
46
|
+
end
|
47
|
+
else
|
48
|
+
to_enum(:each)
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
def allows_empty?
|
53
|
+
@allows_empty
|
54
|
+
end
|
55
|
+
|
56
|
+
def trigger(node)
|
57
|
+
@action.call(node, name)
|
58
|
+
end
|
59
|
+
end
|
60
|
+
end
|
@@ -0,0 +1,141 @@
|
|
1
|
+
require_relative "./errors"
|
2
|
+
class Grammaphone
|
3
|
+
# This is not a descendant of Enumerator. This is explicit and intentional,
|
4
|
+
# due to use as an almost tree-like object. This implementation makes it behave
|
5
|
+
# as a near-functional list structure, which is extremely useful for this parser.
|
6
|
+
class TokenStream
|
7
|
+
# This doesn't need to be here, but it could potentially be useful
|
8
|
+
include Enumerable
|
9
|
+
|
10
|
+
def initialize(tokens, &split_method)
|
11
|
+
case tokens
|
12
|
+
when String
|
13
|
+
if split_method.nil?
|
14
|
+
@enum = tokens.split(" ")
|
15
|
+
else
|
16
|
+
@enum = split_method.call(tokens).to_a
|
17
|
+
end
|
18
|
+
when Array
|
19
|
+
@enum = tokens.dup
|
20
|
+
else
|
21
|
+
raise TokenStreamError unless tokens.respond_to?(:to_a)
|
22
|
+
@enum = tokens.to_a.dup # dup just in case
|
23
|
+
end
|
24
|
+
@pointer = 0
|
25
|
+
end
|
26
|
+
|
27
|
+
# This ensures that all instances refer to the exact same token stream,
|
28
|
+
# but not necessarily at the same point. This saves a great deal of
|
29
|
+
# memory, without risking stream data integrity.
|
30
|
+
def initialize_copy(orig)
|
31
|
+
@enum = orig.instance_variable_get(:@enum)
|
32
|
+
super
|
33
|
+
end
|
34
|
+
|
35
|
+
# Gets the next non-empty token, consuming all viewed tokens.
|
36
|
+
#
|
37
|
+
# Follows the same relationship as `peek` and `peek_token`
|
38
|
+
def next
|
39
|
+
token = next_token
|
40
|
+
token = next_token while token&.empty?
|
41
|
+
token
|
42
|
+
end
|
43
|
+
|
44
|
+
# Gets the next token, consuming it.
|
45
|
+
def next_token
|
46
|
+
token = @enum[@pointer]
|
47
|
+
raise NonstringTokenError unless token.nil? || token.kind_of?(String)
|
48
|
+
@pointer += 1
|
49
|
+
token
|
50
|
+
end
|
51
|
+
|
52
|
+
# Peeks at the nth token from the current pointer, not counting empty tokens,
|
53
|
+
# not consuming any tokens.
|
54
|
+
#
|
55
|
+
# if no count is given, deaults to the next immediate token.
|
56
|
+
#
|
57
|
+
# Follows the same relationship as `next` and `next_token`
|
58
|
+
def peek(n = 0)
|
59
|
+
offset = (0..n).inject(0) do |acc, p|
|
60
|
+
peek_token(p)&.empty? ? acc + 1 : acc
|
61
|
+
end
|
62
|
+
peek_token(n + offset)
|
63
|
+
end
|
64
|
+
|
65
|
+
# Peeks at the nth token from the current pointer, not consuming it.
|
66
|
+
#
|
67
|
+
# If no count is given, defaults to the next immediate token.
|
68
|
+
def peek_token(n = 0)
|
69
|
+
raise ArgumentError.new("can't look back in the token stream") if n < 0
|
70
|
+
@enum[@pointer + n]
|
71
|
+
end
|
72
|
+
|
73
|
+
# Consumes the next n tokens, returning `self`.
|
74
|
+
#
|
75
|
+
# This has no meaningful effect if the stream is empty.
|
76
|
+
#
|
77
|
+
# If no count is given, defaults to consuming a single token
|
78
|
+
def skip(n = 1)
|
79
|
+
@pointer += n
|
80
|
+
self
|
81
|
+
end
|
82
|
+
|
83
|
+
# Resets the pointer to the beginning of the token stream.
|
84
|
+
def reset
|
85
|
+
@pointer = 0
|
86
|
+
self
|
87
|
+
end
|
88
|
+
|
89
|
+
# Returns `true` if there are no tokens remaining in the stream and `false`
|
90
|
+
# otherwise. That is, any calls to `peek_token`, `peek`, `next_token`, or
|
91
|
+
# `next` are guaranteed to return `nil` if `empty?` returns `true`.
|
92
|
+
def empty?
|
93
|
+
@pointer >= @enum.size
|
94
|
+
end
|
95
|
+
|
96
|
+
# Provided because there's a chance that it'll be useful. At the very least,
|
97
|
+
# it can't hurt, since any arrays produced are copies.
|
98
|
+
def each
|
99
|
+
if block_given?
|
100
|
+
@enum.each { |t| yield t }
|
101
|
+
self
|
102
|
+
else
|
103
|
+
to_enum(:each)
|
104
|
+
end
|
105
|
+
end
|
106
|
+
|
107
|
+
# Returns the remaining tokens as an Array.
|
108
|
+
def to_a
|
109
|
+
@enum[@pointer..].dup
|
110
|
+
end
|
111
|
+
end
|
112
|
+
|
113
|
+
module Token
|
114
|
+
LITERAL_PREFIX = "\""
|
115
|
+
|
116
|
+
def self.literal?(token)
|
117
|
+
token[0] == LITERAL_PREFIX
|
118
|
+
end
|
119
|
+
|
120
|
+
def self.clean_literal(token)
|
121
|
+
token[1..]
|
122
|
+
end
|
123
|
+
|
124
|
+
def self.matches_literal?(element, token)
|
125
|
+
!token.nil? && literal?(element) && token == clean_literal(element)
|
126
|
+
end
|
127
|
+
|
128
|
+
def self.pattern?(token)
|
129
|
+
token[0] == "/" && token[-1] == "/"
|
130
|
+
end
|
131
|
+
|
132
|
+
def self.clean_pattern(token)
|
133
|
+
/\A#{token[1...-1]}\Z/
|
134
|
+
end
|
135
|
+
|
136
|
+
def self.matches_pattern?(element, token)
|
137
|
+
pattern?(element) && (token =~ clean_pattern(element)) ||
|
138
|
+
token.nil? && "" =~ clean_pattern(element)
|
139
|
+
end
|
140
|
+
end
|
141
|
+
end
|
metadata
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: grammaphone
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Kellen Watt
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2020-09-15 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: A dynamic parser written in Ruby that uses a BNF-adjacent grammar.
|
14
|
+
email: kbw6d9@mst.edu
|
15
|
+
executables: []
|
16
|
+
extensions: []
|
17
|
+
extra_rdoc_files: []
|
18
|
+
files:
|
19
|
+
- lib/grammaphone.rb
|
20
|
+
- lib/grammaphone/errors.rb
|
21
|
+
- lib/grammaphone/rule.rb
|
22
|
+
- lib/grammaphone/tokens.rb
|
23
|
+
homepage: https://github.com/KellenWatt/grammaphone
|
24
|
+
licenses:
|
25
|
+
- MIT
|
26
|
+
metadata: {}
|
27
|
+
post_install_message:
|
28
|
+
rdoc_options: []
|
29
|
+
require_paths:
|
30
|
+
- lib
|
31
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
32
|
+
requirements:
|
33
|
+
- - ">="
|
34
|
+
- !ruby/object:Gem::Version
|
35
|
+
version: '0'
|
36
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
requirements: []
|
42
|
+
rubygems_version: 3.1.2
|
43
|
+
signing_key:
|
44
|
+
specification_version: 4
|
45
|
+
summary: A pure Ruby dynamic parser
|
46
|
+
test_files: []
|