ebnf 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/AUTHORS +1 -0
- data/CREDITS +0 -0
- data/README.md +170 -0
- data/UNLICENSE +24 -0
- data/VERSION +1 -0
- data/bin/ebnf +54 -0
- data/etc/doap.ttl +33 -0
- data/etc/ebnf.bnf +54 -0
- data/lib/ebnf.rb +1029 -0
- data/lib/ebnf/ll1/lexer.rb +475 -0
- data/lib/ebnf/ll1/parser.rb +541 -0
- data/lib/ebnf/ll1/scanner.rb +101 -0
- data/lib/ebnf/version.rb +20 -0
- metadata +125 -0
data/AUTHORS
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
* Gregg Kellogg <gregg@greggkellogg.net>
|
data/CREDITS
ADDED
File without changes
|
data/README.md
ADDED
@@ -0,0 +1,170 @@
|
|
1
|
+
# ebnf
|
2
|
+
|
3
|
+
[EBNF][] parser and generic parser generator.
|
4
|
+
|
5
|
+
## Description
|
6
|
+
This is a [Ruby][] implementation of an [EBNF][] and [BNF][] parser and parser generator.
|
7
|
+
It parses [EBNF][] grammars to [BNF][], generates [First/Follow][] and [Branch][] tables for
|
8
|
+
[LL(1)][] grammars, which can be used with the stream [Tokenizer][] and [LL(1) Parser][].
|
9
|
+
|
10
|
+
Of note in this implementation is that the tokenizer and parser are streaming, so that they can
|
11
|
+
process inputs of arbitrary size.
|
12
|
+
|
13
|
+
## Usage
|
14
|
+
### Parsing an LL(1) Grammar
|
15
|
+
|
16
|
+
require 'ebnf'
|
17
|
+
|
18
|
+
ebnf = EBNF.parse(File.open(./etc/ebnf.bnf))
|
19
|
+
|
20
|
+
Output rules and terminals as S-Expressions, Turtle or EBNF
|
21
|
+
|
22
|
+
puts ebnf.to_sxp
|
23
|
+
puts ebnf.to_ttl
|
24
|
+
puts ebnf.to_ebnf
|
25
|
+
|
26
|
+
Transform EBNF to BNF (generates `alt` or `seq` from `plus`, `star` or `opt`)
|
27
|
+
|
28
|
+
ebnf.make_bnf
|
29
|
+
|
30
|
+
Generate [First/Follow][] rules for BNF grammars
|
31
|
+
|
32
|
+
ebnf.first_follow(start_tokens)
|
33
|
+
|
34
|
+
Generate [Branch][], terminal and [First/Follow][] tables as Ruby for parsing grammars
|
35
|
+
|
36
|
+
ebnf.to_ruby
|
37
|
+
|
38
|
+
### Creating terminal definitions and parser rules to parse generated grammars
|
39
|
+
|
40
|
+
The parser is initialized to callbacks invoked on entry and exit
|
41
|
+
to each `terminal` and `production`. A trivial parser loop can be described as follows:
|
42
|
+
|
43
|
+
require 'ebnf/ll1/parser'
|
44
|
+
require 'meta'
|
45
|
+
|
46
|
+
class Parser
|
47
|
+
include Meta
|
48
|
+
|
49
|
+
terminal(:SYMBOL, /([a-z]|[A-Z]|[0-9]|_)+/) do |parser, prod, token, input|
|
50
|
+
# Add data based on scanned token to input
|
51
|
+
input[:symbol] = token.value
|
52
|
+
end
|
53
|
+
|
54
|
+
production(:rule) do |parser, phase, input, current, callback|
|
55
|
+
# Process on start of production when phase == :start
|
56
|
+
# Set state for entry into recursed rules through current
|
57
|
+
|
58
|
+
# Process on end of production when phase == :finish
|
59
|
+
# return results in input, retrieve results from recursed rules in current
|
60
|
+
|
61
|
+
# Callback to parser loop with callback
|
62
|
+
end
|
63
|
+
|
64
|
+
def initialize(input)
|
65
|
+
parser_options = {
|
66
|
+
:branch => BRANCH,
|
67
|
+
:first => FIRST,
|
68
|
+
:follow => FOLLOW
|
69
|
+
}
|
70
|
+
parse(input, start_symbol, parser_options) do |context, *data|
|
71
|
+
# Process calls from callback from productions
|
72
|
+
|
73
|
+
rescue ArgumentError, RDF::LL1::Parser::Error => e
|
74
|
+
progress("Parsing completed with errors:\n\t#{e.message}")
|
75
|
+
raise RDF::ReaderError, e.message if validate?
|
76
|
+
end
|
77
|
+
|
78
|
+
## EBNF Grammar
|
79
|
+
The [EBNF][] variant used here is based on [W3C][] [EBNF][] as defined in the
|
80
|
+
[XML 1.0 recommendation](http://www.w3.org/TR/REC-xml/), with minor extensions.
|
81
|
+
|
82
|
+
/* An EBNF grammar for EBNF */
|
83
|
+
[1] ebnf ::= (declaration | rule)*
|
84
|
+
|
85
|
+
[2] declaration ::= '@terminals' | '@pass'
|
86
|
+
|
87
|
+
[3] rule ::= lhs '::=' expression
|
88
|
+
|
89
|
+
[4] lhs ::= '[' (SYMBOL | '.')+ ']' SYMBOL
|
90
|
+
|
91
|
+
[5] expression ::= alt
|
92
|
+
|
93
|
+
[6] alt ::= seq ('|' seq)*
|
94
|
+
|
95
|
+
[7] seq ::= diff+
|
96
|
+
|
97
|
+
[8] diff ::= postfix ('-' postfix)*
|
98
|
+
|
99
|
+
[9] postfix ::= primary ( [?*+] )?
|
100
|
+
|
101
|
+
[10] primary ::= HEX
|
102
|
+
| RANGE
|
103
|
+
| ENUM
|
104
|
+
| O_RANGE
|
105
|
+
| O_ENUM
|
106
|
+
| STRING1
|
107
|
+
| STRING2
|
108
|
+
| '(' expression ')'
|
109
|
+
|
110
|
+
@terminals
|
111
|
+
|
112
|
+
[11] SYMBOL ::= ([a-z] | [A-Z] | [0-9] | "_")+
|
113
|
+
|
114
|
+
[12] HEX ::= '#x' ([0-9] | [a-f] | [A-F])+
|
115
|
+
|
116
|
+
[13] RANGE ::= '[' CHAR '-' CHAR ']'
|
117
|
+
|
118
|
+
[14] ENUM ::= '[' CHAR+ ']'
|
119
|
+
|
120
|
+
[15] O_RANGE ::= '[^' CHAR '-' CHAR ']'
|
121
|
+
|
122
|
+
[16] OENUM ::= '[^' CHAR+ ']'
|
123
|
+
|
124
|
+
[17] STRING1 ::= '"' (CHAR - '"')* '"'
|
125
|
+
|
126
|
+
[18] STRING2 ::= "'" (CHAR - "'")* "'"
|
127
|
+
|
128
|
+
[19] CHAR ::= HEX
|
129
|
+
| ('\\' [\\trn'"])
|
130
|
+
| [^\t\r\n'"]
|
131
|
+
|
132
|
+
@pass ::= (
|
133
|
+
[#x20\t\r\n]
|
134
|
+
|
|
135
|
+
)+
|
136
|
+
|
137
|
+
## Documentation
|
138
|
+
Full documentation available on [Rubydoc.info][EBNF doc].
|
139
|
+
|
140
|
+
## Author
|
141
|
+
* [Gregg Kellogg](http://github.com/gkellogg) - <http://greggkellogg.net/>
|
142
|
+
|
143
|
+
## Contributing
|
144
|
+
* Do your best to adhere to the existing coding conventions and idioms.
|
145
|
+
* Don't use hard tabs, and don't leave trailing whitespace on any line.
|
146
|
+
* Do document every method you add using [YARD][] annotations. Read the
|
147
|
+
[tutorial][YARD-GS] or just look at the existing code for examples.
|
148
|
+
* Don't touch the `.gemspec`, `VERSION` or `AUTHORS` files. If you need to
|
149
|
+
change them, do so on your private branch only.
|
150
|
+
* Do feel free to add yourself to the `CREDITS` file and the corresponding
|
151
|
+
list in the the `README`. Alphabetical order applies.
|
152
|
+
* Do note that in order for us to merge any non-trivial changes (as a rule
|
153
|
+
of thumb, additions larger than about 15 lines of code), we need an
|
154
|
+
explicit [public domain dedication][PDD] on record from you.
|
155
|
+
|
156
|
+
## License
|
157
|
+
This is free and unencumbered public domain software. For more information,
|
158
|
+
see <http://unlicense.org/> or the accompanying {file:UNLICENSE} file.
|
159
|
+
|
160
|
+
[Ruby]: http://ruby-lang.org/
|
161
|
+
[YARD]: http://yardoc.org/
|
162
|
+
[YARD-GS]: http://rubydoc.info/docs/yard/file/docs/GettingStarted.md
|
163
|
+
[PDD]: http://lists.w3.org/Archives/Public/public-rdf-ruby/2010May/0013.html
|
164
|
+
[EBNF]: http://www.w3.org/TR/REC-xml/#sec-notation
|
165
|
+
[EBNF doc]:
|
166
|
+
[First/Follow]: http://en.wikipedia.org/wiki/LL_parser#Constructing_an_LL.281.29_parsing_table
|
167
|
+
[Branch]:
|
168
|
+
[LL(1)]:
|
169
|
+
[LL(1) Parser]: http://en.wikipedia.org/wiki/LL_parser
|
170
|
+
[Tokenizer]:
|
data/UNLICENSE
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
This is free and unencumbered software released into the public domain.
|
2
|
+
|
3
|
+
Anyone is free to copy, modify, publish, use, compile, sell, or
|
4
|
+
distribute this software, either in source code form or as a compiled
|
5
|
+
binary, for any purpose, commercial or non-commercial, and by any
|
6
|
+
means.
|
7
|
+
|
8
|
+
In jurisdictions that recognize copyright laws, the author or authors
|
9
|
+
of this software dedicate any and all copyright interest in the
|
10
|
+
software to the public domain. We make this dedication for the benefit
|
11
|
+
of the public at large and to the detriment of our heirs and
|
12
|
+
successors. We intend this dedication to be an overt act of
|
13
|
+
relinquishment in perpetuity of all present and future rights to this
|
14
|
+
software under copyright law.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
19
|
+
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
20
|
+
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
21
|
+
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
22
|
+
OTHER DEALINGS IN THE SOFTWARE.
|
23
|
+
|
24
|
+
For more information, please refer to <http://unlicense.org/>
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.0.1
|
data/bin/ebnf
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# ebnf2ttl --- Generate reasoned Turtle representation of EBNF input file
|
3
|
+
# to be used in extracting parser branch tables (see gramLL1).
|
4
|
+
|
5
|
+
$:.unshift(File.expand_path(File.join(File.dirname(__FILE__), "..", 'lib')))
|
6
|
+
require "bundler/setup"
|
7
|
+
require 'rubygems'
|
8
|
+
require 'getoptlong'
|
9
|
+
require 'ebnf'
|
10
|
+
require 'sxp'
|
11
|
+
|
12
|
+
options = {
|
13
|
+
:format => :sxp,
|
14
|
+
:prefix => "ttl",
|
15
|
+
:namespace => "http://www.w3.org/ns/formats/Turtle#",
|
16
|
+
}
|
17
|
+
|
18
|
+
out = STDOUT
|
19
|
+
|
20
|
+
opts = GetoptLong.new(
|
21
|
+
["--dbg", GetoptLong::NO_ARGUMENT],
|
22
|
+
["--bnf", GetoptLong::NO_ARGUMENT],
|
23
|
+
["--execute", "-e", GetoptLong::REQUIRED_ARGUMENT],
|
24
|
+
["--output", "-o", GetoptLong::REQUIRED_ARGUMENT],
|
25
|
+
["--format", "-f", GetoptLong::REQUIRED_ARGUMENT],
|
26
|
+
["--prefix", "-p", GetoptLong::REQUIRED_ARGUMENT],
|
27
|
+
["--namespace", "-n", GetoptLong::REQUIRED_ARGUMENT],
|
28
|
+
["--verbose", GetoptLong::NO_ARGUMENT]
|
29
|
+
)
|
30
|
+
|
31
|
+
opts.each do |opt, arg|
|
32
|
+
case opt
|
33
|
+
when '--dbg' then options[:debug] = true
|
34
|
+
when '--bnf' then options[:bnf] = true
|
35
|
+
when '--execute' then input = arg
|
36
|
+
when '--format' then options[:format] = arg.to_sym
|
37
|
+
when '--output' then out = File.open(arg, "w")
|
38
|
+
when '--prefix' then options[:prefix] = arg
|
39
|
+
when '--namespace' then options[:namespace] = arg
|
40
|
+
when '--verbose' then $verbose = true
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
input = File.open(ARGV[0]) if ARGV[0]
|
45
|
+
|
46
|
+
ebnf = EBNF.new(input || STDIN, options)
|
47
|
+
ebnf = ebnf.make_bnf if options[:bnf]
|
48
|
+
res = case options[:format]
|
49
|
+
when :sxp then ebnf.to_sxp
|
50
|
+
when :ttl then ebnf.to_ttl(options[:prefix], options[:namespace])
|
51
|
+
else ebnf.ast.inspect
|
52
|
+
end
|
53
|
+
|
54
|
+
out.puts res
|
data/etc/doap.ttl
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
|
2
|
+
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
|
3
|
+
@prefix dc: <http://purl.org/dc/terms/> .
|
4
|
+
@prefix earl: <http://www.w3.org/ns/earl#> .
|
5
|
+
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
|
6
|
+
@prefix doap: <http://usefulinc.com/ns/doap#> .
|
7
|
+
@prefix ex: <http://example.org/> .
|
8
|
+
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
|
9
|
+
|
10
|
+
<http://rubygems.org/gems/rdf-turtle> a doap:Project, earl:TestSubject, earl:Software ;
|
11
|
+
doap:name "RDF::Turtle" ;
|
12
|
+
doap:homepage <http://ruby-rdf.github.com/rdf-turtle> ;
|
13
|
+
doap:license <http://creativecommons.org/licenses/publicdomain/> ;
|
14
|
+
doap:shortdesc "Turtle reader/writer for Ruby."@en ;
|
15
|
+
doap:description "RDF::Turtle is an Turtle reader/writer for the RDF.rb library suite."@en ;
|
16
|
+
doap:created "2011-08-29"^^xsd:date ;
|
17
|
+
doap:programming-language "Ruby" ;
|
18
|
+
doap:implements <http://www.w3.org/TR/turtle/> ;
|
19
|
+
doap:category <http://dbpedia.org/resource/Resource_Description_Framework>,
|
20
|
+
<http://dbpedia.org/resource/Ruby_(programming_language)> ;
|
21
|
+
doap:download-page <http://rubygems.org/gems/rdf-turtle> ;
|
22
|
+
doap:mailing-list <http://lists.w3.org/Archives/Public/public-rdf-ruby/> ;
|
23
|
+
doap:bug-database <http://github.com/ruby-rdf/rdf-turtle/issues> ;
|
24
|
+
doap:blog <http://greggkellogg.net/> ;
|
25
|
+
doap:developer <http://greggkellogg.net/foaf#me> ;
|
26
|
+
doap:maintainer <http://greggkellogg.net/foaf#me> ;
|
27
|
+
doap:documenter <http://greggkellogg.net/foaf#me> ;
|
28
|
+
foaf:maker <http://greggkellogg.net/foaf#me> ;
|
29
|
+
dc:title "RDF::Turtle" ;
|
30
|
+
dc:description "RDF::Turtle is an Turtle reader/writer for the RDF.rb library suite."@en ;
|
31
|
+
dc:date "2011-08-29"^^xsd:date ;
|
32
|
+
dc:creator <http://greggkellogg.net/foaf#me> ;
|
33
|
+
dc:isPartOf <http://rubygems.org/gems/rdf> .
|
data/etc/ebnf.bnf
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
/* An EBNF grammar for EBNF */
|
2
|
+
[1] ebnf ::= (declaration | rule)*
|
3
|
+
|
4
|
+
[2] declaration ::= '@terminals' | '@pass'
|
5
|
+
|
6
|
+
[3] rule ::= lhs '::=' expression
|
7
|
+
|
8
|
+
[4] lhs ::= '[' SYMBOL ']' SYMBOL
|
9
|
+
|
10
|
+
[5] expression ::= alt
|
11
|
+
|
12
|
+
[6] alt ::= seq ('|' seq)*
|
13
|
+
|
14
|
+
[7] seq ::= diff+
|
15
|
+
|
16
|
+
[8] diff ::= postfix ('-' postfix)*
|
17
|
+
|
18
|
+
[9] postfix ::= primary ( [?*+] )?
|
19
|
+
|
20
|
+
[10] primary ::= HEX
|
21
|
+
| RANGE
|
22
|
+
| ENUM
|
23
|
+
| O_RANGE
|
24
|
+
| O_ENUM
|
25
|
+
| STRING1
|
26
|
+
| STRING2
|
27
|
+
| '(' expression ')'
|
28
|
+
|
29
|
+
@terminals
|
30
|
+
|
31
|
+
[11] SYMBOL ::= ([a-z] | [A-Z] | [0-9] | "_")+
|
32
|
+
|
33
|
+
[12] HEX ::= '#x' ([0-9] | [a-f] | [A-F])+
|
34
|
+
|
35
|
+
[13] RANGE ::= '[' CHAR '-' CHAR ']'
|
36
|
+
|
37
|
+
[14] ENUM ::= '[' CHAR+ ']'
|
38
|
+
|
39
|
+
[15] O_RANGE ::= '[^' CHAR '-' CHAR ']'
|
40
|
+
|
41
|
+
[16] OENUM ::= '[^' CHAR+ ']'
|
42
|
+
|
43
|
+
[17] STRING1 ::= '"' (CHAR - '"')* '"'
|
44
|
+
|
45
|
+
[18] STRING2 ::= "'" (CHAR - "'")* "'"
|
46
|
+
|
47
|
+
[19] CHAR ::= HEX
|
48
|
+
| ('\\' [\\trn'"])
|
49
|
+
| [^\t\r\n'"]
|
50
|
+
|
51
|
+
@pass ::= (
|
52
|
+
[#x20\t\r\n]
|
53
|
+
|
|
54
|
+
)+
|
data/lib/ebnf.rb
ADDED
@@ -0,0 +1,1029 @@
|
|
1
|
+
require 'strscan'
|
2
|
+
|
3
|
+
# Extended Bakus-Nour Form (EBNF), being the W3C variation is
|
4
|
+
# originaly defined in the
|
5
|
+
# [W3C XML 1.0 Spec](http://www.w3.org/TR/REC-xml/#sec-notation).
|
6
|
+
#
|
7
|
+
# This version attempts to be less strict than the strict definition
|
8
|
+
# to allow for coloquial variations (such as in the Turtle syntax).
|
9
|
+
#
|
10
|
+
# A rule takes the following form:
|
11
|
+
# \[1\] symbol ::= expression
|
12
|
+
#
|
13
|
+
# Comments include the content between '/*' and '*/'
|
14
|
+
#
|
15
|
+
# @see http://www.w3.org/2000/10/swap/grammar/ebnf2turtle.py
|
16
|
+
# @see http://www.w3.org/2000/10/swap/grammar/ebnf2bnf.n3
|
17
|
+
#
|
18
|
+
# Based on bnf2turtle by Dan Connolly.
|
19
|
+
#
|
20
|
+
# Motivation
|
21
|
+
# ----------
|
22
|
+
#
|
23
|
+
# Many specifications include grammars that look formal but are not
|
24
|
+
# actually checked, by machine, against test data sets. Debugging the
|
25
|
+
# grammar in the XML specification has been a long, tedious manual
|
26
|
+
# process. Only when the loop is closed between a fully formal grammar
|
27
|
+
# and a large test data set can we be confident that we have an accurate
|
28
|
+
# specification of a language (and even then, only the syntax of the language).
|
29
|
+
#
|
30
|
+
#
|
31
|
+
# The grammar in the [N3 design note][] has evolved based on the original
|
32
|
+
# manual transcription into a python recursive-descent parser and
|
33
|
+
# subsequent development of test cases. Rather than maintain the grammar
|
34
|
+
# and the parser independently, our [goal] is to formalize the language
|
35
|
+
# syntax sufficiently to replace the manual implementation with one
|
36
|
+
# derived mechanically from the specification.
|
37
|
+
#
|
38
|
+
#
|
39
|
+
# [N3 design note]: http://www.w3.org/DesignIssues/Notation3
|
40
|
+
#
|
41
|
+
# Related Work
|
42
|
+
# ------------
|
43
|
+
#
|
44
|
+
# Sean Palmer's [n3p announcement][] demonstrated the feasibility of the
|
45
|
+
# approach, though that work did not cover some aspects of N3.
|
46
|
+
#
|
47
|
+
# In development of the [SPARQL specification][], Eric Prud'hommeaux
|
48
|
+
# developed [Yacker][], which converts EBNF syntax to perl and C and C++
|
49
|
+
# yacc grammars. It includes an interactive facility for checking
|
50
|
+
# strings against the resulting grammars.
|
51
|
+
# Yosi Scharf used it in [cwm Release 1.1.0rc1][], which includes
|
52
|
+
# a SPAQRL parser that is *almost* completely mechanically generated.
|
53
|
+
#
|
54
|
+
# The N3/turtle output from yacker is lower level than the EBNF notation
|
55
|
+
# from the XML specification; it has the ?, +, and * operators compiled
|
56
|
+
# down to pure context-free rules, obscuring the grammar
|
57
|
+
# structure. Since that transformation is straightforwardly expressed in
|
58
|
+
# semantic web rules (see [bnf-rules.n3][]), it seems best to keep the RDF
|
59
|
+
# expression of the grammar in terms of the higher level EBNF
|
60
|
+
# constructs.
|
61
|
+
#
|
62
|
+
# [goal]: http://www.w3.org/2002/02/mid/1086902566.21030.1479.camel@dirk;list=public-cwm-bugs
|
63
|
+
# [n3p announcement]: http://lists.w3.org/Archives/Public/public-cwm-talk/2004OctDec/0029.html
|
64
|
+
# [Yacker]: http://www.w3.org/1999/02/26-modules/User/Yacker
|
65
|
+
# [SPARQL specification]: http://www.w3.org/TR/rdf-sparql-query/
|
66
|
+
# [Cwm Release 1.1.0rc1]: http://lists.w3.org/Archives/Public/public-cwm-announce/2005JulSep/0000.html
|
67
|
+
# [bnf-rules.n3]: http://www.w3.org/2000/10/swap/grammar/bnf-rules.n3
|
68
|
+
#
|
69
|
+
# Open Issues and Future Work
|
70
|
+
# ---------------------------
|
71
|
+
#
|
72
|
+
# The yacker output also has the terminals compiled to elaborate regular
|
73
|
+
# expressions. The best strategy for dealing with lexical tokens is not
|
74
|
+
# yet clear. Many tokens in SPARQL are case insensitive; this is not yet
|
75
|
+
# captured formally.
|
76
|
+
#
|
77
|
+
# The schema for the EBNF vocabulary used here (``g:seq``, ``g:alt``, ...)
|
78
|
+
# is not yet published; it should be aligned with [swap/grammar/bnf][]
|
79
|
+
# and the [bnf2html.n3][] rules (and/or the style of linked XHTML grammar
|
80
|
+
# in the SPARQL and XML specificiations).
|
81
|
+
#
|
82
|
+
# It would be interesting to corroborate the claim in the SPARQL spec
|
83
|
+
# that the grammar is LL(1) with a mechanical proof based on N3 rules.
|
84
|
+
#
|
85
|
+
# [swap/grammar/bnf]: http://www.w3.org/2000/10/swap/grammar/bnf
|
86
|
+
# [bnf2html.n3]: http://www.w3.org/2000/10/swap/grammar/bnf2html.n3
|
87
|
+
#
|
88
|
+
# Background
|
89
|
+
# ----------
|
90
|
+
#
|
91
|
+
# The [N3 Primer] by Tim Berners-Lee introduces RDF and the Semantic
|
92
|
+
# web using N3, a teaching and scribbling language. Turtle is a subset
|
93
|
+
# of N3 that maps directly to (and from) the standard XML syntax for
|
94
|
+
# RDF.
|
95
|
+
#
|
96
|
+
# [N3 Primer]: http://www.w3.org/2000/10/swap/Primer.html
|
97
|
+
#
|
98
|
+
# @author Gregg Kellogg
|
99
|
+
class EBNF
|
100
|
+
class Rule
|
101
|
+
# Operations which are flattened to seprate rules in to_bnf
|
102
|
+
BNF_OPS = %w{
|
103
|
+
seq alt diff opt star plus
|
104
|
+
}.map(&:to_sym).freeze
|
105
|
+
|
106
|
+
# @!attribute [rw] sym for rule
|
107
|
+
# @return [Symbol]
|
108
|
+
attr_accessor :sym
|
109
|
+
|
110
|
+
# @!attribute [rw] id of rule
|
111
|
+
# @return [String]
|
112
|
+
attr_accessor :id
|
113
|
+
|
114
|
+
# @!attribute [rw] kind of rule
|
115
|
+
# @return [:rule, :terminal, or :pass]
|
116
|
+
attr_accessor :kind
|
117
|
+
|
118
|
+
# @!attribute [rw] expr rule expression
|
119
|
+
# @return [Array]
|
120
|
+
attr_accessor :expr
|
121
|
+
|
122
|
+
# @!attribute [rw] orig original rule
|
123
|
+
# @return [String]
|
124
|
+
attr_accessor :orig
|
125
|
+
|
126
|
+
# @!attribute [r] first terminals that immediately procede this rule
|
127
|
+
# @return [Array<Rule>]
|
128
|
+
attr_reader :first
|
129
|
+
|
130
|
+
# @!attribute [r] follow terminals that immediately follow this rule
|
131
|
+
# @return [Array<Rule>]
|
132
|
+
attr_reader :follow
|
133
|
+
|
134
|
+
# @!attribute [rw] start indicates that this is a starting rule
|
135
|
+
# @return [Boolean]
|
136
|
+
attr_accessor :start
|
137
|
+
|
138
|
+
# @param [Integer] id
|
139
|
+
# @param [Symbol] sym
|
140
|
+
# @param [Array] expr
|
141
|
+
# @param [EBNF] ebnf
|
142
|
+
# @param [Hash{Symbol => Object}] option
|
143
|
+
# @option options [Symbol] :kind
|
144
|
+
# @option options [String] :ebnf
|
145
|
+
def initialize(sym, id, expr, options = {})
|
146
|
+
@sym, @id = sym, id
|
147
|
+
@expr = expr.is_a?(Array) ? expr : [:seq, expr]
|
148
|
+
@ebnf = options[:ebnf]
|
149
|
+
@kind = case
|
150
|
+
when options[:kind] then options[:kind]
|
151
|
+
when sym.to_s == sym.to_s.upcase then :terminal
|
152
|
+
when !BNF_OPS.include?(@expr.first) then :terminal
|
153
|
+
else :rule
|
154
|
+
end
|
155
|
+
end
|
156
|
+
|
157
|
+
# Serializes this rule to an S-Expression
|
158
|
+
# @return [String]
|
159
|
+
def to_sxp
|
160
|
+
elements = [sym, id, [:kind, kind]]
|
161
|
+
elements << [:start, true] if start
|
162
|
+
elements << first.sort_by(&:to_s).unshift(:first) if first
|
163
|
+
elements << follow.sort_by(&:to_s).unshift(:follow) if follow
|
164
|
+
elements << expr
|
165
|
+
elements.to_sxp
|
166
|
+
end
|
167
|
+
def to_s; to_sxp; end
|
168
|
+
|
169
|
+
# Serializes this rule to an Turtle
|
170
|
+
# @return [String]
|
171
|
+
def to_ttl
|
172
|
+
@ebnf.debug("to_ttl") {inspect}
|
173
|
+
comment = orig.strip.
|
174
|
+
gsub(/"""/, '\"\"\"').
|
175
|
+
gsub("\\", "\\\\").
|
176
|
+
sub(/^\"/, '\"').
|
177
|
+
sub(/\"$/m, '\"')
|
178
|
+
statements = [
|
179
|
+
%{:#{id} rdfs:label "#{id}"; rdf:value "#{sym}";},
|
180
|
+
%{ rdfs:comment #{comment.inspect};},
|
181
|
+
]
|
182
|
+
|
183
|
+
statements += ttl_expr(expr, kind == :terminal ? "re" : "g", 1, false)
|
184
|
+
"\n" + statements.join("\n")
|
185
|
+
end
|
186
|
+
|
187
|
+
##
|
188
|
+
# Transform EBNF rule to BNF rules:
|
189
|
+
#
|
190
|
+
# * Transform (a [n] rule (op1 (op2))) into two rules:
|
191
|
+
# (a [n] rule (op1 a.2))
|
192
|
+
# (_a_1 [n.1] rule (op2))
|
193
|
+
# * Transform (a rule (opt b)) into (a rule (alt _empty "foo"))
|
194
|
+
# * Transform (a rule (star b)) into (a rule (alt _empty (seq b a)))
|
195
|
+
# * Transform (a rule (plus b)) into (a rule (seq b (star b)
|
196
|
+
# @return [Array<Rule>]
|
197
|
+
def to_bnf
|
198
|
+
new_rules = []
|
199
|
+
return [self] unless kind == :rule
|
200
|
+
|
201
|
+
# Look for rules containing recursive definition and rewrite to multiple rules. If `expr` contains elements which are in array form, where the first element of that array is a symbol, create a new rule for it.
|
202
|
+
if expr.any? {|e| e.is_a?(Array) && BNF_OPS.include?(e.first)}
|
203
|
+
# * Transform (a [n] rule (op1 (op2))) into two rules:
|
204
|
+
# (a.1 [n.1] rule (op1 a.2))
|
205
|
+
# (a.2 [n.2] rule (op2))
|
206
|
+
# duplicate ourselves for rewriting
|
207
|
+
this = dup
|
208
|
+
rule_seq = 1
|
209
|
+
new_rules << this
|
210
|
+
|
211
|
+
expr.each_with_index do |e, index|
|
212
|
+
next unless e.is_a?(Array) && e.first.is_a?(Symbol)
|
213
|
+
new_sym, new_id = "_#{sym}_#{rule_seq}".to_sym, "#{id}.#{rule_seq}"
|
214
|
+
rule_seq += 1
|
215
|
+
this.expr[index] = new_sym
|
216
|
+
new_rule = Rule.new(new_sym, new_id, e, :ebnf => @ebnf)
|
217
|
+
new_rules << new_rule
|
218
|
+
end
|
219
|
+
|
220
|
+
# Return new rules after recursively applying #to_bnf
|
221
|
+
new_rules = new_rules.map {|r| r.to_bnf}.flatten
|
222
|
+
elsif expr.first == :opt
|
223
|
+
# * Transform (a rule (opt b)) into (a rule (alt _empty "foo"))
|
224
|
+
new_rules = Rule.new(sym, id, [:alt, :_empty, expr.last], :ebnf => @ebnf).to_bnf
|
225
|
+
elsif expr.first == :star
|
226
|
+
# * Transform (a rule (star b)) into (a rule (alt _empty (seq b a)))
|
227
|
+
new_rules = [Rule.new(sym, id, [:alt, :_empty, "_#{sym}_star".to_sym], :ebnf => @ebnf)] +
|
228
|
+
Rule.new("_#{sym}_star".to_sym, "#{id}*", [:seq, expr.last, sym], :ebnf => @ebnf).to_bnf
|
229
|
+
elsif expr.first == :plus
|
230
|
+
# * Transform (a rule (plus b)) into (a rule (seq b (star b)
|
231
|
+
new_rules = Rule.new(sym, id, [:seq, expr.last, [:star, expr.last]], :ebnf => @ebnf).to_bnf
|
232
|
+
else
|
233
|
+
# Otherwise, no further transformation necessary
|
234
|
+
new_rules << self
|
235
|
+
end
|
236
|
+
|
237
|
+
return new_rules
|
238
|
+
end
|
239
|
+
|
240
|
+
# Does this rule start with a sym? It does if expr is that sym,
|
241
|
+
# expr starts with alt and contains that sym, or
|
242
|
+
# expr starts with seq and the next element is that sym
|
243
|
+
# @param [Symbol, class] sym
|
244
|
+
# Symbol matching any start element, or if it is String, any start element which is a String
|
245
|
+
# @return [Array<Symbol, String>] list of symbol (singular), or strings which are start symbol, or nil if there are none
|
246
|
+
def starts_with(sym)
|
247
|
+
if seq? && sym === (v = expr.fetch(1, nil))
|
248
|
+
[v]
|
249
|
+
elsif alt? && expr.any? {|e| sym === e}
|
250
|
+
expr.select {|e| sym === e}
|
251
|
+
else
|
252
|
+
nil
|
253
|
+
end
|
254
|
+
end
|
255
|
+
|
256
|
+
# Add terminal as proceding this rule
|
257
|
+
# @param [Array<Rule>] terminals
|
258
|
+
# @return [Integer] if number of terminals added
|
259
|
+
def add_first(terminals)
|
260
|
+
@first ||= []
|
261
|
+
terminals -= @first # Remove those already in first
|
262
|
+
@first += terminals
|
263
|
+
terminals.length
|
264
|
+
end
|
265
|
+
|
266
|
+
# Add terminal as following this rule. Don't add _eps as a follow
|
267
|
+
#
|
268
|
+
# @param [Array<Rule>] terminals
|
269
|
+
# @return [Integer] if number of terminals added
|
270
|
+
def add_follow(terminals)
|
271
|
+
terminals -= @follow || [] # Remove those already in first
|
272
|
+
terminals -= [:_eps] # Special case, don't add empty string as a follow terminal
|
273
|
+
unless terminals.empty?
|
274
|
+
@follow ||= []
|
275
|
+
@follow += terminals
|
276
|
+
end
|
277
|
+
terminals.length
|
278
|
+
end
|
279
|
+
|
280
|
+
# Is this rule of the form (seq ...)?
|
281
|
+
def seq?
|
282
|
+
expr.is_a?(Array) && expr.first == :seq
|
283
|
+
end
|
284
|
+
|
285
|
+
# Is this rule of the form (alt ...)?
|
286
|
+
def alt?
|
287
|
+
expr.is_a?(Array) && expr.first == :alt
|
288
|
+
end
|
289
|
+
|
290
|
+
def inspect
|
291
|
+
"#<EBNF::Rule:#{object_id} " +
|
292
|
+
{:sym => sym, :id => id, :kind => kind, :expr => expr}.inspect +
|
293
|
+
">"
|
294
|
+
end
|
295
|
+
|
296
|
+
# Two rules are equal if they have the same {#sym}, {#kind} and {#expr}
|
297
|
+
# @param [Rule] other
|
298
|
+
# @return [Boolean]
|
299
|
+
def ==(other)
|
300
|
+
sym == other.sym &&
|
301
|
+
kind == other.kind &&
|
302
|
+
expr == other.expr
|
303
|
+
end
|
304
|
+
|
305
|
+
# Two rules are equivalent if they have the same {#expr}
|
306
|
+
# @param [Rule] other
|
307
|
+
# @return [Boolean]
|
308
|
+
def equivalent?(other)
|
309
|
+
expr == other.expr
|
310
|
+
end
|
311
|
+
|
312
|
+
# Rewrite the rule substituting src_rule for dst_rule wherever
|
313
|
+
# it is used in the production (first level only).
|
314
|
+
# @param [Rule] src_rule
|
315
|
+
# @param [Rule] dst_rule
|
316
|
+
# @return [Rule]
|
317
|
+
def rewrite(src_rule, dst_rule)
|
318
|
+
case @expr
|
319
|
+
when Array
|
320
|
+
@expr = @expr.map {|e| e == src_rule.sym ? dst_rule.sym : e}
|
321
|
+
else
|
322
|
+
@expr = dst_rule.sym if @expr == src_rule.sym
|
323
|
+
end
|
324
|
+
self
|
325
|
+
end
|
326
|
+
|
327
|
+
# Rules compare using their ids
|
328
|
+
def <=>(other)
|
329
|
+
if id.to_i == other.id.to_i
|
330
|
+
id <=> other.id
|
331
|
+
else
|
332
|
+
id.to_i <=> other.id.to_i
|
333
|
+
end
|
334
|
+
end
|
335
|
+
|
336
|
+
private
|
337
|
+
def ttl_expr(expr, pfx, depth, is_obj = true)
|
338
|
+
indent = ' ' * depth
|
339
|
+
@ebnf.debug("ttl_expr", :depth => depth) {expr.inspect}
|
340
|
+
op = expr.shift if expr.is_a?(Array)
|
341
|
+
statements = []
|
342
|
+
|
343
|
+
if is_obj
|
344
|
+
bra, ket = "[ ", " ]"
|
345
|
+
else
|
346
|
+
bra = ket = ''
|
347
|
+
end
|
348
|
+
|
349
|
+
case op
|
350
|
+
when :seq, :alt, :diff
|
351
|
+
statements << %{#{indent}#{bra}#{pfx}:#{op} (}
|
352
|
+
expr.each {|a| statements += ttl_expr(a, pfx, depth + 1)}
|
353
|
+
statements << %{#{indent} )#{ket}}
|
354
|
+
when :opt, :plus, :star
|
355
|
+
statements << %{#{indent}#{bra}#{pfx}:#{op} }
|
356
|
+
statements += ttl_expr(expr.first, pfx, depth + 1)
|
357
|
+
statements << %{#{indent} #{ket}} unless ket.empty?
|
358
|
+
when :_empty, :_eps, :_empty
|
359
|
+
statements << %{#{indent}"g:#{op.to_s[1..-1]}"}
|
360
|
+
when :"'"
|
361
|
+
statements << %{#{indent}"#{esc(expr)}"}
|
362
|
+
when :range
|
363
|
+
statements << %{#{indent}#{bra} re:matches #{cclass(expr.first).inspect} #{ket}}
|
364
|
+
when :hex
|
365
|
+
raise "didn't expect \" in expr" if expr.include?(:'"')
|
366
|
+
statements << %{#{indent}#{bra} re:matches #{cclass(expr.first).inspect} #{ket}}
|
367
|
+
else
|
368
|
+
if is_obj
|
369
|
+
statements << %{#{indent}#{expr.inspect}}
|
370
|
+
else
|
371
|
+
statements << %{#{indent}g:seq ( #{expr.inspect} )}
|
372
|
+
end
|
373
|
+
end
|
374
|
+
|
375
|
+
statements.last << " ." unless is_obj
|
376
|
+
@ebnf.debug("statements", :depth => depth) {statements.join("\n")}
|
377
|
+
statements
|
378
|
+
end
|
379
|
+
|
380
|
+
##
|
381
|
+
# turn an XML BNF character class into an N3 literal for that
|
382
|
+
# character class (less the outer quote marks)
|
383
|
+
#
|
384
|
+
# >>> cclass("^<>'{}|^`")
|
385
|
+
# "[^<>'{}|^`]"
|
386
|
+
# >>> cclass("#x0300-#x036F")
|
387
|
+
# "[\\u0300-\\u036F]"
|
388
|
+
# >>> cclass("#xC0-#xD6")
|
389
|
+
# "[\\u00C0-\\u00D6]"
|
390
|
+
# >>> cclass("#x370-#x37D")
|
391
|
+
# "[\\u0370-\\u037D]"
|
392
|
+
#
|
393
|
+
# as in: ECHAR ::= '\' [tbnrf\"']
|
394
|
+
# >>> cclass("tbnrf\\\"'")
|
395
|
+
# 'tbnrf\\\\\\"\''
|
396
|
+
#
|
397
|
+
# >>> cclass("^#x22#x5C#x0A#x0D")
|
398
|
+
# '^\\u0022\\\\\\u005C\\u000A\\u000D'
|
399
|
+
def cclass(txt)
|
400
|
+
'[' +
|
401
|
+
txt.gsub(/\#x[0-9a-fA-F]+/) do |hx|
|
402
|
+
hx = hx[2..-1]
|
403
|
+
if hx.length <= 4
|
404
|
+
"\\u#{'0' * (4 - hx.length)}#{hx}"
|
405
|
+
elsif hx.length <= 8
|
406
|
+
"\\U#{'0' * (8 - hx.length)}#{hx}"
|
407
|
+
end
|
408
|
+
end +
|
409
|
+
']'
|
410
|
+
end
|
411
|
+
end
|
412
|
+
|
413
|
+
# Abstract syntax tree from parse
|
414
|
+
attr_reader :ast
|
415
|
+
|
416
|
+
# Parse the string or file input generating an abstract syntax tree
|
417
|
+
# in S-Expressions (similar to SPARQL SSE)
|
418
|
+
#
|
419
|
+
# @param [#read, #to_s] input
|
420
|
+
# @param [Hash{Symbol => Object}] options
|
421
|
+
# @option options [Boolean, Array] :debug
|
422
|
+
# Output debug information to an array or STDOUT.
|
423
|
+
def initialize(input, options = {})
|
424
|
+
@options = options
|
425
|
+
@lineno, @depth = 1, 0
|
426
|
+
terminal = false
|
427
|
+
@ast = []
|
428
|
+
|
429
|
+
input = input.respond_to?(:read) ? input.read : input.to_s
|
430
|
+
scanner = StringScanner.new(input)
|
431
|
+
|
432
|
+
eachRule(scanner) do |r|
|
433
|
+
debug("rule string") {r.inspect}
|
434
|
+
case r
|
435
|
+
when /^@terminals/
|
436
|
+
# Switch mode to parsing terminals
|
437
|
+
terminal = true
|
438
|
+
when /^@pass\s*(.*)$/m
|
439
|
+
rule = depth {ruleParts("[0] " + r)}
|
440
|
+
rule.kind = :pass
|
441
|
+
rule.orig = r
|
442
|
+
@ast << rule
|
443
|
+
else
|
444
|
+
rule = depth {ruleParts(r)}
|
445
|
+
|
446
|
+
rule.kind = :terminal if terminal # Override after we've parsed @terminals
|
447
|
+
rule.orig = r
|
448
|
+
@ast << rule
|
449
|
+
end
|
450
|
+
end
|
451
|
+
end
|
452
|
+
|
453
|
+
##
# Transform EBNF Rule set to BNF:
#
# * Add rule [0] (_empty rule (seq))
# * Transform each rule into a set of rules that are just BNF, using {Rule#to_bnf}.
# @return [EBNF] self
def make_bnf
  # Rule 0 is the empty production, always present in the BNF rule set
  new_ast = [Rule.new(:_empty, "0", [:seq], :kind => :rule)]

  # Expand each EBNF rule into its equivalent set of plain BNF rules
  ast.each do |rule|
    debug("make_bnf") {"expand from: #{rule.inspect}"}
    new_rules = rule.to_bnf
    debug(" => ") {new_rules.map(&:sym).join(', ')}
    new_ast += new_rules
  end

  # Consolidate equivalent terminal rules: map each canonical (src) rule
  # to the list of duplicate (dst) rules that should be folded into it
  to_rewrite = {}
  new_ast.select {|r| r.kind == :terminal}.each do |src_rule|
    new_ast.select {|r| r.kind == :terminal}.each do |dst_rule|
      if src_rule.equivalent?(dst_rule) && src_rule != dst_rule
        debug("make_bnf") {"equivalent rules: #{src_rule.inspect} and #{dst_rule.inspect}"}
        (to_rewrite[src_rule] ||= []) << dst_rule
      end
    end
  end

  # Replace references to equivalent rules with canonical rule
  to_rewrite.each do |src_rule, dst_rules|
    dst_rules.each do |dst_rule|
      new_ast.each do |mod_rule|
        debug("make_bnf") {"rewrite #{mod_rule.inspect} from #{dst_rule.sym} to #{src_rule.sym}"}
        mod_rule.rewrite(dst_rule, src_rule)
      end
    end
  end

  # AST now has just rewritten rules; drop the folded duplicates
  compacted_ast = new_ast - to_rewrite.values.flatten.compact

  # NOTE(review): a prior comment said "Sort AST by number", but no sort
  # happens here; consumers (e.g. #to_sxp, #to_ttl) sort on output.
  @ast = compacted_ast

  self
end
|
498
|
+
|
499
|
+
##
# Iterate over each rule or terminal of the given kind.
#
# @param [:terminal, :rule] kind
# @yield rule
# @yieldparam [Rule] rule
def each(kind, &block)
  ast.select {|r| r.kind == kind}.each(&block)
end
|
506
|
+
|
507
|
+
##
# Create first/follow for each rule using techniques defined for LL(1) parsers.
#
# @param [Array<String>] starts
#   Set of symbols which are start rules
# @return [EBNF] self
# @raise [RuntimeError] if a start symbol has no corresponding rule
# @see http://en.wikipedia.org/wiki/LL_parser#Constructing_an_LL.281.29_parsing_table
def first_follow(starts)
  # Add _eof to follow all start rules
  starts.map(&:to_sym).each do |sym|
    rule = ast.detect {|r| r.sym == sym}
    raise "No rule found for start symbol #{sym}" unless rule
    rule.add_follow([:_eof])
    rule.start = true
  end

  # Comprehension rules: create shorter versions of all non-terminal
  # sequences, repeating until no new comprehension is added
  comprehensions = []
  begin
    comprehensions = []
    ast.select {|r| r.seq? && r.kind == :rule && r.expr.length > 2}.each do |rule|
      new_expr = rule.expr[2..-1].unshift(:seq)
      unless ast.any? {|r| r.expr == new_expr}
        debug("first_follow") {"add comprehension rule for #{rule.sym} => #{new_expr.inspect}"}
        new_rule = Rule.new("_#{rule.sym}_comp".to_sym, "#{rule.id}.comp", new_expr)
        comprehensions << new_rule
      end
    end

    @ast += comprehensions
    debug("first_follow") {"comprehensions #{comprehensions.length}"}
  end while !comprehensions.empty?

  # Fi(a w' ) = { a } for every terminal a
  # For each rule whose expr's first element of a seq is a terminal, or
  # having any element of alt a terminal, add that terminal to the first
  # set for this rule
  each(:rule) do |rule|
    each(:terminal) do |terminal|
      rule.add_first([terminal.sym]) if rule.starts_with(terminal.sym)
    end

    # Add strings to first for strings which are start elements
    start_strs = rule.starts_with(String)
    rule.add_first(start_strs) if start_strs
  end

  # Fi(ε) = { ε }
  # Add _eps as a first of _empty
  empty = ast.detect {|r| r.sym == :_empty}
  empty.add_first([:_eps])

  # Loop until no more first/follow elements are added
  firsts, follows = 0, 0
  begin
    firsts, follows = 0, 0
    each(:rule) do |rule|
      each(:rule) do |first_rule|
        next if first_rule == rule || first_rule.first.nil?

        # Fi(A w' ) = Fi(A) for every nonterminal A with ε not in Fi(A)
        # For each rule that starts with another rule having firsts, add
        # the firsts of that rule to this rule, unless it already has
        # those terminals in its first
        if rule.starts_with(first_rule.sym)
          depth {debug("FF.1") {"add first #{first_rule.first.inspect} to #{rule.sym}"}}
          firsts += rule.add_first(first_rule.first)
        end

        # Fi(A w' ) = Fi(A) \ { ε } ∪ Fi(w' ) for every nonterminal A with ε in Fi(A)
        # For each rule starting with eps, add the terminals for the
        # comprehension of this rule
        # NOTE(review): expr elements appear to be symbols elsewhere in
        # this method, yet this compares one to a Rule object — confirm
        # the comparison can ever match.
        if rule.seq? &&
          rule.expr.fetch(1, nil) == first_rule &&
          first_rule.first.include?(:_eps) &&
          (comp = find_comp(rule))

          depth {debug("FF.2") {"add first #{first_rule.first.inspect} to #{comp.sym}"}}
          firsts += comp.add_first(first_rule.first)
        end
      end

      # Only run these rules if the rule is a sequence having two or more
      # elements and a comprehension exists for it
      if rule.seq? && (comp = find_comp(rule))
        # if there is a rule of the form Aj → wAiw' , then
        if (ai = find_rule(rule.expr[1])) && ai.kind == :rule && comp.first
          # * if the terminal a is in Fi(w' ), then add a to Fo(Ai)
          #
          # Add follow terminals based on the first terminals of a
          # comprehension of this rule (having the same sequence other
          # than the first rule in the sequence)
          #
          # @example
          #   rule: (seq a b c)
          #   comp: (seq b c)
          #   if comp.first == [T]
          #   => a.follow += [T]
          depth {debug("FF.3") {"add follow #{comp.first.inspect} to #{ai.sym}"}}
          follows += ai.add_follow(comp.first)
        end

        # Follows of a rule are also follows of the comprehension of the rule.
        if rule.follow
          depth {debug("FF.4") {"add follow #{rule.follow.inspect} to #{comp.sym}"}}
          follows += comp.add_follow(rule.follow)
        end

        # * if ε is in Fi(w' ), then add Fo(Aj) to Fo(Ai)
        #
        # If the comprehension of a sequence has an _eps first, then the
        # firsts of the rule also become follows of the first member.
        # NOTE(review): the debug message reports rule.follow, but the
        # code adds rule.first — confirm which is intended.
        if comp.first && comp.first.include?(:_eps) && rule.first
          member = find_rule(rule.expr.fetch(1, nil))
          depth {debug("FF.4") {"add follow #{rule.follow.inspect} to #{member.sym}"}}
          follows += member.add_follow(rule.first) if member.kind == :rule
        end
      end

      # Follows of a rule are also follows of the last production in the rule
      if rule.seq? && rule.follow &&
        (member = find_rule(rule.expr.last)) &&
        member.kind == :rule

        depth {debug("FF.5") {"add follow #{rule.follow.inspect} to #{member.sym}"}}
        follows += member.add_follow(rule.follow)
      end

      # For alts, anything that follows the rule follows each member of the rule
      if rule.alt? && rule.follow
        rule.expr[1..-1].map {|s| find_rule(s)}.each do |mem|
          if mem && mem.kind == :rule
            depth {debug("FF.6") {"add follow #{rule.first.inspect} to #{mem.sym}"}}
            follows += mem.add_follow(rule.follow)
          end
        end
      end
    end

    debug("first_follow") {"firsts #{firsts}, follows #{follows}"}
  end while (firsts + follows) > 0

  # Fix: return self as documented; previously the trailing `end while`
  # loop was the last expression, so the method returned nil and chained
  # calls such as `ebnf.first_follow(starts).to_sxp` failed.
  self
end
|
643
|
+
|
644
|
+
##
# Write out the parsed syntax as an S-Expression.
#
# Uses the SXP gem when available; otherwise falls back to the
# AST's own #to_sxp serialization.
# @return [String]
def to_sxp
  require 'sxp'
  SXP::Generator.string(ast.sort)
rescue LoadError
  ast.to_sxp
end
|
655
|
+
# @return [String] the S-Expression serialization (see #to_sxp)
def to_s
  to_sxp
end
|
656
|
+
|
657
|
+
##
# Duplicate this parser, giving the copy its own shallow copy of the
# AST array so rule additions on one do not affect the other.
# @return [EBNF]
def dup
  super.tap do |copy|
    copy.instance_variable_set(:@ast, @ast.dup)
  end
end
|
662
|
+
|
663
|
+
##
# Find a rule given a symbol, memoizing lookups.
# @param [Symbol] sym
# @return [Rule]
def find_rule(sym)
  @find ||= {}
  @find[sym] ||= ast.detect {|r| r.sym == sym}
end
|
670
|
+
|
671
|
+
##
# Find the comprehension of a rule.
# Comprehensions are created in {#first_follow} when the rule is a sequence
# with more than 1 element. They are named automatically as "_sym_comp"
# where "sym" is the symbol of the source rule.
# @param [Rule] source
# @return [Rule, false, nil]
#   NOTE(review): when +source+ is not a multi-element seq this memoizes
#   +false+ (not nil), so ||= recomputes on every call; callers only use
#   the result in boolean context, so behavior is unaffected.
def find_comp(source)
  (@comp ||= {})[source.sym] ||= source.seq? && source.expr.length > 2 && find_rule("_#{source.sym}_comp".to_sym)
end
|
679
|
+
|
680
|
+
##
# Write out syntax tree as Turtle
# @param [String] prefix for language
# @param [String] ns URI for language
# @return [String]
def to_ttl(prefix, ns)
  # Build the prologue only when there are rules. Fix: previously an
  # empty AST made the `unless` expression nil, and `nil.join("\n")`
  # raised NoMethodError.
  header = if ast.empty?
    []
  else
    [
      "@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>.",
      "@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>.",
      "@prefix #{prefix}: <#{ns}>.",
      "@prefix : <#{ns}>.",
      "@prefix re: <http://www.w3.org/2000/10/swap/grammar/regex#>.",
      "@prefix g: <http://www.w3.org/2000/10/swap/grammar/ebnf#>.",
      "",
      ":language rdfs:isDefinedBy <>; g:start :#{ast.first.id}.",
      "",
    ]
  end.join("\n")

  # Serialize each rule/terminal in sorted order after the prologue
  header +
    ast.sort.
      select {|a| [:rule, :terminal].include?(a.kind)}.
      map(&:to_ttl).
      join("\n")
end
|
705
|
+
|
706
|
+
##
# Iterate over rule strings.
# A line that starts with '[' (a rule id) or '@' (a directive) starts a
# new rule; comments and inter-rule whitespace are consumed and dropped.
# @lineno is updated to the line where each yielded rule begins.
#
# @param [StringScanner] scanner
# @yield rule_string
# @yieldparam [String] rule_string
def eachRule(scanner)
  cur_lineno = 1
  r = ''
  until scanner.eos?
    case
    when s = scanner.scan(%r(\s+)m)
      # Eat whitespace
      cur_lineno += s.count("\n")
      #debug("eachRule(ws)") { "[#{cur_lineno}] #{s.inspect}" }
    when s = scanner.scan(%r(/\*([^\*]|\*[^\/])*\*/)m)
      # Eat /* ... */ comments (may span lines)
      cur_lineno += s.count("\n")
      debug("eachRule(comment)") { "[#{cur_lineno}] #{s.inspect}" }
    when s = scanner.scan(%r(^@terminals))
      # The @terminals directive is yielded as its own "rule" string
      #debug("eachRule(@terminals)") { "[#{cur_lineno}] #{s.inspect}" }
      yield(r) unless r.empty?
      @lineno = cur_lineno
      yield(s)
      r = ''
    when s = scanner.scan(/@pass/)
      # Found rule start, if we've already collected a rule, yield it
      #debug("eachRule(@pass)") { "[#{cur_lineno}] #{s.inspect}" }
      yield r unless r.empty?
      @lineno = cur_lineno
      r = s
    when s = scanner.scan(/\[(?=\w+\])/)
      # Found rule start ("[id]" via lookahead), yield any collected rule
      yield r unless r.empty?
      #debug("eachRule(rule)") { "[#{cur_lineno}] #{s.inspect}" }
      @lineno = cur_lineno
      r = s
    else
      # Collect until end of line, or start of comment
      s = scanner.scan_until(%r((?:/\*)|$)m)
      cur_lineno += s.count("\n")
      #debug("eachRule(rest)") { "[#{cur_lineno}] #{s.inspect}" }
      r += s
    end
  end
  # Yield the final rule collected before EOF
  yield r unless r.empty?
end
|
754
|
+
|
755
|
+
##
# Parse a rule string of the shape "[num] sym ::= expr" into a rule
# number, a symbol and an expression.
#
# @param [String] rule
# @return [Rule]
def ruleParts(rule)
  lhs, rhs = rule.split('::=', 2).map(&:strip)
  num, sym = lhs.split(']', 2).map(&:strip)
  num = num[1..-1]  # drop the leading '['
  parsed = Rule.new(sym && sym.to_sym, num, ebnf(rhs).first, :ebnf => self)
  debug("ruleParts") { parsed.inspect }
  parsed
end
|
768
|
+
|
769
|
+
##
# Parse a string into an expression tree and a remaining string
#
# @example
#   >>> ebnf("a b c")
#   ((seq, [('id', 'a'), ('id', 'b'), ('id', 'c')]), '')
#
#   >>> ebnf("a? b+ c*")
#   ((seq, [(opt, ('id', 'a')), (plus, ('id', 'b')), ('*', ('id', 'c'))]), '')
#
#   >>> ebnf(" | x xlist")
#   ((alt, [(seq, []), (seq, [('id', 'x'), ('id', 'xlist')])]), '')
#
#   >>> ebnf("a | (b - c)")
#   ((alt, [('id', 'a'), (diff, [('id', 'b'), ('id', 'c')])]), '')
#
#   >>> ebnf("a) b c")
#   (('id', 'a'), ' b c')
#
# @param [String] s
# @return [Array] expression and remaining string
def ebnf(s)
  debug("ebnf") {"(#{s.inspect})"}
  expr, rest = depth {alt(s)}
  debug {"=> alt returned #{[expr, rest].inspect}"}
  unless rest.empty?
    # A trailing ")" closes a parenthesized group: consume it and return
    tok, after = depth {terminal(rest)}
    debug {"=> terminal returned #{[tok, after].inspect}"}
    return [expr, after] if tok.is_a?(Array) && tok.first == :")"
  end
  [expr, rest]
end
|
813
|
+
|
814
|
+
##
# Parse alt: a list of sequences separated by '|'
# >>> alt("a | b | c")
# ((alt, [('id', 'a'), ('id', 'b'), ('id', 'c')]), '')
# @param [String] s
# @return [Array] expression and remaining string
def alt(s)
  debug("alt") {"(#{s.inspect})"}
  args = []
  while !s.empty?
    e, s = depth {seq(s)}
    debug {"=> seq returned #{[e, s].inspect}"}
    if e.to_s.empty?
      # No sequence parsed; a leading empty alternative (as in " | x y")
      # becomes an explicit empty sequence, otherwise stop
      break unless args.empty?
      e = [:seq, []] # empty sequence
    end
    args << e
    unless s.empty?
      # Continue only when the next token is the '|' separator
      t, ss = depth {terminal(s)}
      break unless t[0] == :alt
      s = ss
    end
  end
  # With a single alternative return it directly (note: relies on `e`
  # still holding the sole parsed expression after the loop exits)
  args.length > 1 ? [args.unshift(:alt), s] : [e, s]
end
|
839
|
+
|
840
|
+
##
# parse seq: a run of adjacent sub-expressions
#
# >>> seq("a b c")
# ((seq, [('id', 'a'), ('id', 'b'), ('id', 'c')]), '')
#
# >>> seq("a b? c")
# ((seq, [('id', 'a'), (opt, ('id', 'b')), ('id', 'c')]), '')
def seq(s)
  debug("seq") {"(#{s.inspect})"}
  parts = []
  until s.empty?
    expr, rest = depth {diff(s)}
    debug {"=> diff returned #{[expr, rest].inspect}"}
    break if expr.to_s.empty?
    parts << expr
    s = rest
  end
  case parts.length
  when 0 then ["", s]         # nothing parsed
  when 1 then parts + [s]     # single expression returned as-is
  else        [parts.unshift(:seq), s]
  end
end
|
869
|
+
|
870
|
+
##
# parse diff: a sub-expression optionally followed by "- sub-expression"
#
# >>> diff("a - b")
# ((diff, [('id', 'a'), ('id', 'b')]), '')
def diff(s)
  debug("diff") {"(#{s.inspect})"}
  left, s = depth {postfix(s)}
  debug {"=> postfix returned #{[left, s].inspect}"}
  return [left, s] if left.to_s.empty? || s.empty?

  tok, rest = depth {terminal(s)}
  debug {"diff #{[tok, rest].inspect}"}
  return [left, s] unless tok.is_a?(Array) && tok.first == :diff

  # "-" seen: the next primary is the excluded expression
  right, s = primary(rest)
  raise "Syntax Error" if right.to_s.empty?
  [[:diff, left, right], s]
end
|
896
|
+
|
897
|
+
##
# parse postfix: a primary optionally followed by '?', '*' or '+'
#
# >>> postfix("a b c")
# (('id', 'a'), ' b c')
#
# >>> postfix("a? b c")
# ((opt, ('id', 'a')), ' b c')
def postfix(s)
  debug("postfix") {"(#{s.inspect})"}
  expr, rest = depth {primary(s)}
  debug {"=> primary returned #{[expr, rest].inspect}"}
  return ["", rest] if expr.to_s.empty?

  unless rest.empty?
    tok, after = depth {terminal(rest)}
    debug {"=> #{[tok, after].inspect}"}
    if tok.is_a?(Array) && [:opt, :star, :plus].include?(tok.first)
      return [[tok.first, expr], after]
    end
  end
  [expr, rest]
end
|
919
|
+
|
920
|
+
##
# parse primary: an identifier, literal, range/hex terminal, or a
# parenthesized sub-expression
#
# >>> primary("a b c")
# (('id', 'a'), ' b c')
def primary(s)
  debug("primary") {"(#{s.inspect})"}
  tok, rest = depth {terminal(s)}
  debug {"=> terminal returned #{[tok, rest].inspect}"}
  case
  when tok.is_a?(Symbol), tok.is_a?(String)
    [tok, rest]
  when [:range, :hex].include?(tok.first)
    [tok, rest]
  when tok.first == :"("
    # Recurse for the parenthesized group; ebnf consumes the closing ")"
    expr, rest = depth {ebnf(rest)}
    debug {"=> ebnf returned #{[expr, rest].inspect}"}
    [expr, rest]
  else
    ["", rest]
  end
end
|
941
|
+
|
942
|
+
##
# parse one terminal; return the terminal and the remaining string
#
# A terminal is represented as a tuple whose 1st item gives the type;
# some types have additional info in the tuple.
#
# @example
#   >>> terminal("'abc' def")
#   (("'", 'abc'), ' def')
#
#   >>> terminal("[0-9]")
#   ((range, '0-9'), '')
#   >>> terminal("#x00B7")
#   ((hex, '#x00B7'), '')
def terminal(s)
  str = s.strip
  case c = str[0, 1]
  when '"', "'"
    # Quoted literal: everything up to the matching quote
    value, rest = str[1..-1].split(c, 2)
    [value, rest]
  when '['
    # Character range, e.g. [0-9]
    value, rest = str[1..-1].split(']', 2)
    [[:range, value], rest]
  when '#'
    # Hex character reference, e.g. #x00B7
    str.match(/(#\w+)(.*)$/)
    [[:hex, $1], $2]
  when /[[:alpha:]]/
    # Identifier (rule reference)
    str.match(/(\w+)(.*)$/)
    [$1.to_sym, $2]
  when '@'
    # Directive, e.g. @pass
    str.match(/@(#\w+)(.*)$/)
    [[:"@", $1], $2]
  when '-' then [[:diff], str[1..-1]]
  when '?' then [[:opt],  str[1..-1]]
  when '|' then [[:alt],  str[1..-1]]
  when '+' then [[:plus], str[1..-1]]
  when '*' then [[:star], str[1..-1]]
  when /[\(\)]/
    # Parentheses become their own token, e.g. [:"("]
    [[c.to_sym], str[1..-1]]
  else
    raise "unrecognized terminal: #{str.inspect}"
  end
end
|
997
|
+
|
998
|
+
##
# Execute the block one nesting level deeper, for debug indentation.
#
# Fix: uses +ensure+ so the depth counter is restored even when the
# block raises; previously an exception left @depth incremented,
# skewing all subsequent debug indentation.
#
# @return [Object] the block's value
def depth
  @depth += 1
  yield
ensure
  @depth -= 1
end
|
1004
|
+
|
1005
|
+
##
# Progress output when debugging
#
# @overload debug(node, message)
#   @param [String] node relative location in input
#   @param [String] message ("")
#
# @overload debug(message)
#   @param [String] message ("")
#
# @yieldreturn [String] added to message
def debug(*args)
  # No-op unless a :debug collector was configured
  return unless @options[:debug]
  # A trailing Hash is treated as options (e.g. :depth overrides indent)
  options = args.last.is_a?(Hash) ? args.pop : {}
  depth = options[:depth] || @depth
  message = args.pop
  # The last arg may be a Proc, evaluated lazily only when debugging
  message = message.call if message.is_a?(Proc)
  args << message if message
  args << yield if block_given?
  message = "#{args.join(': ')}"
  # Prefix with the current input line number, indent by nesting depth
  str = "[#{@lineno}]#{' ' * depth}#{message}"
  # :debug may be an Array collector or `true` for STDERR output
  @options[:debug] << str if @options[:debug].is_a?(Array)
  $stderr.puts(str) if @options[:debug] == true
end
|
1029
|
+
end
|