grammaphone 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: a6fc51b323812f7278f9e51540cccbc1df3530c17da6abc75f9fb7c97065433d
4
+ data.tar.gz: 3e17ab606b44b15d37445d81a73102fb2122569ecc43839c40c25498f6ebd340
5
+ SHA512:
6
+ metadata.gz: b06c48105fd3b8b3682fe66fa1ec747d779100174724dd2b17de1c316e9b6dc81ff64aedebda0c9c97fa5654c7a7bf9767d4d07d9f15bd59f7d13e354d6f977f
7
+ data.tar.gz: 492dce702c1fa07ce1258c5ff62c32bfc8c8669ec224def19fba81eb8fc7ee69b83f7ca6d8b8518ab7b4d5e491538efbe32c224cea635f80f8d3008a0ad16aee
@@ -0,0 +1,192 @@
1
+ # Grammaphone is a dynamically-definable parser pseudo-generator based on a
2
+ # BNF-like grammar.
3
+ #
4
+ # ### Grammar
5
+ # A grammar is defined using key-value pairs in a hash. This maps
6
+ # names to associated sets of patterns they can match against. A name can be
7
+ # any sequence of characters that are valid in a Ruby String (i.e. any Unicode
8
+ # character). Similarly, the patterns can be composed of any Ruby-valid characters.
9
+ #
10
+ # ### Writing a Rule
11
+ # A rule is a list of element identifiers, separated by spaces (`" "` or `\x20`).
12
+ #
13
+ # Element identifiers fall into three categories: literals, patterns, and rules.
14
+ #
15
+ # A literal is a string of characters that will be matched iff the token matches
16
+ # the literal text *exactly*. Literal sequences are preceded by one double-quote
17
+ # (`"` or `\x22`). For example, to match the exact string "hello", the literal
18
+ # string you would use is `"hello`. Note that the initial double-quote is not
19
+ # included in the literal itself.
20
+ #
21
+ # A pattern is a string of characters representing a Ruby-valid regex. Pattern
22
+ # sequences are surrounded by one forward slash (`"/"` or `\x2F`) on each end.
23
+ # For example, to match a capitalized name composed strictly of ASCII letters,
24
+ # (e.g. "April", "John", "Alex"), the pattern string could be `/[A-Z][a-z]*/`.
25
+ # Note that every pattern must strictly match the whole token. Anything else
26
+ # won't be matched.
27
+ #
28
+ # A rule identifier is a string of characters representing the name of a grammatical
29
+ # rule. Rule identifier sequences are trivial, and are specified by using the
30
+ # exact name of the rule with no decorations. For example, to reference a rule
31
+ # named "NUMBER", the Rule string is `NUMBER`. Note that, like any grammar, a rule
32
+ # can refer to itself to define a recursive pattern. If a rule name is specified
33
+ # that doesn't exist in the current grammar, an exception will be raised and
34
+ # parsing will immediately stop.
35
+ #
36
+ # A rule is composed of zero or more element identifiers, which are evaluated in
37
+ # order. If a rule has no identifiers, then it will only match an empty token
38
+ # list, which will always succeed.
39
+ #
40
+ # Multiple options for a rule can be specified by passing an Array where each
41
+ # element of the Array is a valid rule. These rules are treated as possibilities
42
+ # for matching, with a precedence specified by the order.
43
+ #
44
+ # #### Example
45
+ # The two most common introductory programs are "Hello, world!", and an
46
+ # introduction program, given a name. For the purposes of this example, the latter
47
+ # prints in the format "Hello, \<name\>!", where `<name>` is the name entered.
48
+ #
49
+ # The following Hash describes the grammar that matches the output of these programs,
50
+ # assuming they are tokenized as ["Hello", ",", " ", \<name\>/world, "!"]. That
51
+ # tokenization is not default, but is assumed for the purposes of this example.
52
+ # This is by no means the only possible grammar, just an example.
53
+ #
54
+ # ```ruby
55
+ # {
56
+ # START: '"Hello ", /\s/ NAME "!',
57
+ # NAME: ['"world', '/[A-Z][a-z]*/']
58
+ # }
59
+ # ```
60
+ #
61
+ # Note that to match a space, you need to use the pattern, since the splitting function
62
+ # for rules splits on the space character, regardless of where it is.
63
+
64
+ require_relative "grammaphone/errors"
65
+ require_relative "grammaphone/tokens"
66
+ require_relative "grammaphone/rule"
67
+
68
class Grammaphone

  # Builds a TokenStream from +src+, splitting with +split_method+ when
  # given (otherwise TokenStream's default whitespace split).
  def self.tokenize(src, &split_method)
    TokenStream.new(src, &split_method)
  end

  # Creates a parser from +rules+, a Hash mapping rule names to rule
  # strings (or Arrays of rule strings; see the class documentation).
  #
  # +node_type+ is the container used to accumulate parse results; it must
  # respond to +<<+. +default_action+ receives (node, name) for each
  # matched rule and its return value replaces the node; defaults to the
  # identity.
  #
  # Raises ArgumentError if +rules+ is not a Hash or +node_type+ lacks +<<+.
  def initialize(rules = {}, node_type = Array, &default_action)
    raise ArgumentError.new("cannot form parser from a #{rules.class}") unless rules.kind_of? Hash
    raise ArgumentError.new("syntax tree type must respond to <<") unless node_type.method_defined?(:"<<")
    @default_action = (default_action.nil? ? lambda{|node, name| node} : default_action)
    @node_type = node_type
    @rules = rules.map do |k, v|
      Rule.new(k, v, @default_action)
    end
  end

  # Adds a rule named +name+, or replaces the pattern and action of an
  # existing rule with that name. +action+ falls back to the parser's
  # default action when omitted.
  def add_rule(name, rule, &action)
    m = @rules.find {|r| r.name == name}
    action = @default_action if action.nil?
    if m.nil?
      @rules << Rule.new(name, rule, action)
    else
      m.rule = rule
      m.action = action
    end
  end

  # Returns the ruleset as a Hash of name => rule (rule lists are copies).
  def rules
    @rules.map{|r| [r.name, r.rule]}.to_h
  end

  # Parses +token_stream+ (coerced to a TokenStream if necessary) starting
  # from the first rule in the ruleset. Returns [matches, result] on a
  # match, or false when the start rule doesn't match.
  #
  # Raises EmptyRulesetError when no rules are defined.
  def parse(token_stream)
    token_stream = TokenStream.new(token_stream) unless token_stream.kind_of?(TokenStream)
    raise EmptyRulesetError if @rules.size == 0
    self.send(@rules[0].name, token_stream, @node_type)
  end

  # Not to be released in shipped version
  # NOTE(review): this sends only the stream, so match_rule receives a nil
  # result_type and will fail on result_type.new — confirm intended usage.
  def test(name, token_stream)
    self.send(name, TokenStream.new(token_stream))
  end

  # Rule names are only reported as responded-to when +include_all+ is
  # set, mirroring how method_missing exposes them.
  def respond_to_missing?(m, include_all = false)
    (include_all && @rules.any?{|r| r.name == m}) || super
  end

  # This is fun, but it doesn't really take advantage of metaprogramming in a way
  # that can't be accomplished with match_rule. It also lets the rules be "called"
  # outside of normal context
  def method_missing(m, *args, &block)
    r = @rules.find{|r| r.name == m}
    if r
      match_rule(r, args[0], args[1])
    else
      super
    end
  end

  private

  # Attempts to match rule +r+ against +stream+, trying each option in
  # order. Returns [matches, result] for the first option that matches, or
  # false when none do. +result_type+ is instantiated to collect the node.
  #
  # This is an enormous function. It needs to be pared down.
  def match_rule(r, stream, result_type)
    matches = nil
    result = nil
    r.each do |option|
      tokens = stream.dup
      # Start every option with a fresh result container. Previously the
      # container was created once before this loop, so tokens appended by
      # a partially-matched option that then failed leaked into the next
      # option's result (and into the value passed to the rule's action).
      result = result_type.new
      # NOTE(review): an empty option aborts matching entirely here, while
      # the class docs say an empty rule always matches — confirm which
      # behavior is intended.
      break if option.empty?
      matched = true

      option.each do |element|
        token = tokens.peek
        if Token.literal?(element)
          unless Token.matches_literal?(element, token)
            matches = nil
            matched = false
            break
          end

          matches ||= []
          matches << token
          result << token
          tokens.next # might as well be tokens.skip
        elsif Token.pattern?(element)
          unless Token.matches_pattern?(element, token)
            matches = nil
            matched = false
            break
          end

          matches ||= []
          # A pattern may match the empty string at end-of-stream; in that
          # case there is no token to record.
          unless token.nil?
            matches << token
            result << token
          end
          tokens.next
        else
          # Anything that isn't a literal or a pattern is a rule reference.
          raise TokenError.new("Can't have empty patterns") if element.empty?

          submatches, res = self.send(element, tokens, result_type)
          unless submatches
            matches = nil
            matched = false
            break
          end

          matches ||= []
          matches << submatches
          result << res
          # Consume the tokens matched by the sub-rule (at least one).
          tokens.skip([submatches.size, 1].max)
        end
      end

      if matched
        result = r.trigger(result)
        break
      end
    end
    return false if matches.nil?
    [matches, result]
  end
end
@@ -0,0 +1,29 @@
1
class Grammaphone
  # Base class for all errors raised during parsing.
  class ParseError < StandardError; end

  # Indicates a problem with the ruleset definition.
  class RulesetError < ParseError
    def message
      # ": " separates the default message (the class name, or a custom
      # message passed to raise) from the description. Previously the two
      # were concatenated with no separator.
      super + ": Problem with ruleset definition"
    end
  end

  # Raised when parsing is attempted with an empty ruleset.
  class EmptyRulesetError < RulesetError
    def message
      super + ": empty ruleset not allowed"
    end
  end

  # Base class for token-level errors.
  class TokenError < ParseError; end

  # Raised when a token stream yields something other than a String.
  class NonstringTokenError < TokenError
    def message
      super + ": Token not a String"
    end
  end

  # Raised when the input to a TokenStream can't be converted to an Array.
  class TokenStreamError < TokenError
    def message
      super + ": Non-Array-able types can't be tokenized"
    end
  end
end
@@ -0,0 +1,60 @@
1
+ require_relative "./errors"
2
class Grammaphone
  private

  # A single named grammar rule: an ordered list of rule-option strings
  # plus an action run on the node produced when the rule matches.
  #
  # NOTE(review): `private` has no effect on nested class constants; Rule
  # remains reachable as Grammaphone::Rule. Use `private_constant :Rule`
  # if it is meant to be hidden.
  class Rule
    attr_reader :name

    # +name+ must be a String or Symbol (stored as a Symbol). +rule+ is a
    # rule string or an Array of rule strings. +act+ is the action Proc,
    # or nil for the identity action.
    def initialize(name, rule, act = nil)
      raise ArgumentError.new("rule names must be a String or Symbol") unless (name.kind_of?(Symbol) || name.kind_of?(String))
      @name = name.to_sym
      self.rule = rule
      self.action = act
    end

    # Returns a copy of the option list so callers can't mutate our state.
    def rule
      @rule.dup
    end

    # Accepts a single rule string or an Array of rule strings; anything
    # else raises ArgumentError. Input is defensively copied.
    def rule=(rule)
      case rule
      when Array
        raise ArgumentError.new("grammar rule as an Array must contain only Strings") unless rule.all?{|r| r.kind_of?(String)}
        @rule = rule.dup
      when String
        @rule = [rule.dup]
      else
        raise ArgumentError.new("grammar rule must be a String or Array of Strings")
      end
      @allows_empty = @rule.any?{|r| r.empty?}
    end

    # action expected to return an Array-like object with flatten implemented
    #
    # Raises ArgumentError unless +action+ is a Proc or nil.
    def action=(action)
      raise ArgumentError.new("rule actions must be a proc") unless (action.kind_of?(Proc) || action.kind_of?(NilClass))
      if action.nil?
        # Identity action. (Was `lambda {|tokens, name| token}`, which
        # raised NameError on the undefined local `token` when triggered.)
        @action = lambda {|tokens, name| tokens}
      else
        @action = action
      end
    end

    # Yields each option as an Array of element identifiers (split on
    # spaces); returns an Enumerator when no block is given.
    def each
      if block_given?
        @rule.each do |r|
          yield r.split(" ")
        end
      else
        to_enum(:each)
      end
    end

    # True when any option is the empty rule (matches the empty stream).
    def allows_empty?
      @allows_empty
    end

    # Runs this rule's action on +node+, returning the action's result.
    def trigger(node)
      @action.call(node, name)
    end
  end
end
@@ -0,0 +1,141 @@
1
+ require_relative "./errors"
2
class Grammaphone
  # This is not a descendant of Enumerator. This is explicit and intentional,
  # due to use as an almost tree-like object. This implementation makes it behave
  # as a near-functional list structure, which is extremely useful for this parser.
  class TokenStream
    # This doesn't need to be here, but it could potentially be useful
    include Enumerable

    # +tokens+ may be a String (split on spaces, or by +split_method+ when
    # given), an Array, or anything responding to to_a.
    #
    # Raises TokenStreamError when +tokens+ can't be converted to an Array.
    def initialize(tokens, &split_method)
      case tokens
      when String
        if split_method.nil?
          @enum = tokens.split(" ")
        else
          @enum = split_method.call(tokens).to_a
        end
      when Array
        @enum = tokens.dup
      else
        raise TokenStreamError unless tokens.respond_to?(:to_a)
        @enum = tokens.to_a.dup # dup just in case
      end
      @pointer = 0
    end

    # This ensures that all instances refer to the exact same token stream,
    # but not necessarily at the same point. This saves a great deal of
    # memory, without risking stream data integrity.
    def initialize_copy(orig)
      @enum = orig.instance_variable_get(:@enum)
      super
    end

    # Gets the next non-empty token, consuming all viewed tokens.
    #
    # Follows the same relationship as `peek` and `peek_token`
    def next
      token = next_token
      token = next_token while token&.empty?
      token
    end

    # Gets the next token, consuming it. Returns nil at end of stream.
    #
    # Raises NonstringTokenError when the stream holds a non-String token.
    def next_token
      token = @enum[@pointer]
      raise NonstringTokenError unless token.nil? || token.kind_of?(String)
      @pointer += 1
      token
    end

    # Peeks at the nth token from the current pointer, not counting empty
    # tokens and not consuming any tokens. Returns nil when fewer than
    # n + 1 non-empty tokens remain.
    #
    # If no count is given, defaults to the next immediate token.
    #
    # Follows the same relationship as `next` and `next_token`
    def peek(n = 0)
      raise ArgumentError.new("can't look back in the token stream") if n < 0
      # Walk forward, skipping empty tokens, until the (n+1)th non-empty
      # token or the end of the stream. The previous implementation only
      # offset by the number of empties among the first n+1 raw slots, so
      # consecutive empty tokens could be returned by mistake.
      seen = 0
      i = 0
      loop do
        token = peek_token(i)
        return token if token.nil?
        unless token.empty?
          return token if seen == n
          seen += 1
        end
        i += 1
      end
    end

    # Peeks at the nth token from the current pointer, not consuming it.
    #
    # If no count is given, defaults to the next immediate token.
    def peek_token(n = 0)
      raise ArgumentError.new("can't look back in the token stream") if n < 0
      @enum[@pointer + n]
    end

    # Consumes the next n tokens, returning `self`.
    #
    # This has no meaningful effect if the stream is empty.
    #
    # If no count is given, defaults to consuming a single token
    def skip(n = 1)
      @pointer += n
      self
    end

    # Resets the pointer to the beginning of the token stream.
    def reset
      @pointer = 0
      self
    end

    # Returns `true` if there are no tokens remaining in the stream and `false`
    # otherwise. That is, any calls to `peek_token`, `peek`, `next_token`, or
    # `next` are guaranteed to return `nil` if `empty?` returns `true`.
    def empty?
      @pointer >= @enum.size
    end

    # Provided because there's a chance that it'll be useful. At the very least,
    # it can't hurt, since any arrays produced are copies.
    def each
      if block_given?
        @enum.each { |t| yield t }
        self
      else
        to_enum(:each)
      end
    end

    # Returns the remaining tokens as an Array.
    def to_a
      @enum[@pointer..].dup
    end
  end

  # Helpers for classifying rule elements and matching them against tokens.
  module Token
    LITERAL_PREFIX = "\""

    # True when the element denotes a literal (leading double-quote).
    def self.literal?(token)
      token[0] == LITERAL_PREFIX
    end

    # Strips the leading double-quote from a literal element.
    def self.clean_literal(token)
      token[1..]
    end

    # True when +token+ is exactly the text of the literal +element+.
    def self.matches_literal?(element, token)
      !token.nil? && literal?(element) && token == clean_literal(element)
    end

    # True when the element denotes a pattern (surrounded by slashes).
    def self.pattern?(token)
      token[0] == "/" && token[-1] == "/"
    end

    # Builds an anchored Regexp from a pattern element so it must match
    # the entire token.
    # NOTE(review): \Z still permits a trailing newline; \z would anchor
    # the true end of string — confirm which is intended.
    def self.clean_pattern(token)
      /\A#{token[1...-1]}\Z/
    end

    # True when +element+ is a pattern matching the whole +token+. A nil
    # token (end of stream) matches iff the pattern accepts "".
    #
    # Previously the nil-token clause was not guarded by pattern?, so a
    # non-pattern element could be "matched" against a nil token.
    def self.matches_pattern?(element, token)
      return false unless pattern?(element)
      subject = token.nil? ? "" : token
      !(subject =~ clean_pattern(element)).nil?
    end
  end
end
metadata ADDED
@@ -0,0 +1,46 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: grammaphone
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Kellen Watt
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2020-09-15 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: A dynamic parser written in Ruby that uses a BNF-adjacent grammar.
14
+ email: kbw6d9@mst.edu
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - lib/grammaphone.rb
20
+ - lib/grammaphone/errors.rb
21
+ - lib/grammaphone/rule.rb
22
+ - lib/grammaphone/tokens.rb
23
+ homepage: https://github.com/KellenWatt/grammaphone
24
+ licenses:
25
+ - MIT
26
+ metadata: {}
27
+ post_install_message:
28
+ rdoc_options: []
29
+ require_paths:
30
+ - lib
31
+ required_ruby_version: !ruby/object:Gem::Requirement
32
+ requirements:
33
+ - - ">="
34
+ - !ruby/object:Gem::Version
35
+ version: '0'
36
+ required_rubygems_version: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ requirements: []
42
+ rubygems_version: 3.1.2
43
+ signing_key:
44
+ specification_version: 4
45
+ summary: A pure Ruby dynamic parser
46
+ test_files: []