ebnf 2.1.3 → 2.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d3b1d692ce9936bb68c8c89722a06b90b7cfd6c42231ff57d49780a98214a325
4
- data.tar.gz: da6430584d824a1d070d364879c17b0b3b0bd8c9b32b44af841cd039e153e7f6
3
+ metadata.gz: '08b2411d5c4d34425d00259126e0d6f55c086b2c60c74e8d3ddc6a099a60ec5e'
4
+ data.tar.gz: d8185780e437d3db9c2644d62f51d497b25be130d20b79d63e3101e222180408
5
5
  SHA512:
6
- metadata.gz: 4e0bacbdef9b82ecca13fca9babcf8fe643acb3afaad077b05aebfc237f542859a1e6bea67e6b2e776c2e81b51dd7900cd24e39d9712582de6959d5e65ecfa42
7
- data.tar.gz: 3001f1864c2cf8fade2856fb7d3383f4b1ac8519d2914e54cc55b8a461f6f208d41c3fa35539367bad104a74068cc868220255ce933f301894530eacb0abf51e
6
+ metadata.gz: b972788258b8261d6e59a093268f31be74d3db13535b0db63199aae9ed36b93602d6b8034439152ce62361797eb3990b747d33a55551baa01bd2d1a9aed6bf6f
7
+ data.tar.gz: cc3b0bb1ecd8c0f0e7135989e96bb826d35f1406ecc3482e88a00f77aaf1df0c8ab5c6f98cb11b37c2c83f77b2d9d762f227d20a0bec44a27cbe48616c31f4a4
data/README.md CHANGED
@@ -93,7 +93,7 @@ Inevitably while implementing a parser for some specific grammar, a developer wi
93
93
 
94
94
  The {EBNF::Writer} class can be used to write parsed grammars out, either as formatted text, or HTML. Because grammars are written from the Abstract Syntax Tree, represented as [S-Expressions][S-Expression], this provides a means of transforming between grammar formats (e.g., W3C [EBNF][] to [ABNF][]), although with some potential loss in semantic fidelity (case-insensitive string matching vs. case-sensitive matching).
95
95
 
96
- The formatted HTML results are designed to be appropriate for including in specifications. If the [Nokogumbo](https://rubygems.org/gems/nokogumbo) gem list available, the resulting HTML encoded grammar will also be validated.
96
+ The formatted HTML results are designed to be appropriate for including in specifications.
97
97
 
98
98
  ### Parser Errors
99
99
  On a parsing failure, an exception is raised with information that may be useful in determining the source of the error.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 2.1.3
1
+ 2.2.0
@@ -32,60 +32,12 @@ module EBNF::LL1
32
32
  # @see https://en.wikipedia.org/wiki/Lexical_analysis
33
33
  class Lexer
34
34
  include Enumerable
35
-
36
- ESCAPE_CHARS = {
37
- '\\t' => "\t", # \u0009 (tab)
38
- '\\n' => "\n", # \u000A (line feed)
39
- '\\r' => "\r", # \u000D (carriage return)
40
- '\\b' => "\b", # \u0008 (backspace)
41
- '\\f' => "\f", # \u000C (form feed)
42
- '\\"' => '"', # \u0022 (quotation mark, double quote mark)
43
- "\\'" => '\'', # \u0027 (apostrophe-quote, single quote mark)
44
- '\\\\' => '\\' # \u005C (backslash)
45
- }.freeze
46
- ESCAPE_CHAR4 = /\\u(?:[0-9A-Fa-f]{4,4})/u.freeze # \uXXXX
47
- ESCAPE_CHAR8 = /\\U(?:[0-9A-Fa-f]{8,8})/u.freeze # \UXXXXXXXX
48
- ECHAR = /\\./u.freeze # More liberal unescaping
49
- UCHAR = /#{ESCAPE_CHAR4}|#{ESCAPE_CHAR8}/n.freeze
35
+ include ::EBNF::Unescape
50
36
 
51
37
  ##
52
38
  # @return [Regexp] defines whitespace, including comments, otherwise whitespace must be explicit in terminals
53
39
  attr_reader :whitespace
54
40
 
55
- ##
56
- # Returns a copy of the given `input` string with all `\uXXXX` and
57
- # `\UXXXXXXXX` Unicode codepoint escape sequences replaced with their
58
- # unescaped UTF-8 character counterparts.
59
- #
60
- # @param [String] string
61
- # @return [String]
62
- # @see https://www.w3.org/TR/rdf-sparql-query/#codepointEscape
63
- def self.unescape_codepoints(string)
64
- string = string.dup
65
- string.force_encoding(Encoding::ASCII_8BIT) if string.respond_to?(:force_encoding)
66
-
67
- # Decode \uXXXX and \UXXXXXXXX code points:
68
- string = string.gsub(UCHAR) do |c|
69
- s = [(c[2..-1]).hex].pack('U*')
70
- s.respond_to?(:force_encoding) ? s.force_encoding(Encoding::ASCII_8BIT) : s
71
- end
72
-
73
- string.force_encoding(Encoding::UTF_8) if string.respond_to?(:force_encoding)
74
- string
75
- end
76
-
77
- ##
78
- # Returns a copy of the given `input` string with all string escape
79
- # sequences (e.g. `\n` and `\t`) replaced with their unescaped UTF-8
80
- # character counterparts.
81
- #
82
- # @param [String] input
83
- # @return [String]
84
- # @see https://www.w3.org/TR/rdf-sparql-query/#grammarEscapes
85
- def self.unescape_string(input)
86
- input.gsub(ECHAR) { |escaped| ESCAPE_CHARS[escaped] || escaped[1..-1]}
87
- end
88
-
89
41
  ##
90
42
  # Tokenizes the given `input` string or stream.
91
43
  #
@@ -338,7 +290,7 @@ module EBNF::LL1
338
290
  # @return [String]
339
291
  def unescape(string)
340
292
  if @options[:unescape]
341
- Lexer.unescape_string(Lexer.unescape_codepoints(string))
293
+ EBNF::Unescape.unescape(string)
342
294
  else
343
295
  string
344
296
  end
data/lib/ebnf/native.rb CHANGED
@@ -287,10 +287,10 @@ module EBNF
287
287
  case m = s[0,1]
288
288
  when '"', "'" # STRING1 or STRING2
289
289
  l, s = s[1..-1].split(m.rstrip, 2)
290
- [LL1::Lexer.unescape_string(l), s]
290
+ [Unescape.unescape_string(l), s]
291
291
  when '[' # RANGE, O_RANGE
292
292
  l, s = s[1..-1].split(/(?<=[^\\])\]/, 2)
293
- [[:range, LL1::Lexer.unescape_string(l)], s]
293
+ [[:range, Unescape.unescape_string(l)], s]
294
294
  when '#' # HEX
295
295
  s.match(/(#x\h+)(.*)$/)
296
296
  l, s = $1, $2
@@ -55,6 +55,7 @@ module EBNF::PEG
55
55
  def production_handlers; (@production_handlers ||= {}); end
56
56
  def terminal_handlers; (@terminal_handlers ||= {}); end
57
57
  def terminal_regexps; (@terminal_regexps ||= {}); end
58
+ def terminal_options; (@terminal_options ||= {}); end
58
59
 
59
60
  ##
60
61
  # Defines the pattern for a terminal node and a block to be invoked
@@ -72,9 +73,8 @@ module EBNF::PEG
72
73
  # defaults to the expression defined in the associated rule.
73
74
  # If unset, the terminal rule is used for matching.
74
75
  # @param [Hash] options
75
- # @option options [Hash{String => String}] :map ({})
76
- # A mapping from terminals, in lower-case form, to
77
- # their canonical value
76
+ # @option options [Boolean] :unescape
77
+ # Cause strings and codepoints to be unescaped.
78
78
  # @yield [value, prod]
79
79
  # @yieldparam [String] value
80
80
  # The scanned terminal value.
@@ -86,6 +86,7 @@ module EBNF::PEG
86
86
  def terminal(term, regexp = nil, **options, &block)
87
87
  terminal_regexps[term] = regexp if regexp
88
88
  terminal_handlers[term] = block if block_given?
89
+ terminal_options[term] = options.freeze
89
90
  end
90
91
 
91
92
  ##
@@ -100,6 +101,8 @@ module EBNF::PEG
100
101
  # Options which are returned from {Parser#onStart}.
101
102
  # @option options [Boolean] :as_hash (false)
102
103
  # If the production is a `seq`, causes the value to be represented as a single hash, rather than an array of individual hashes for each sub-production. Note that this is not always advisable due to the possibility of repeated productions within the sequence.
104
+ # @option options[:upper, :lower] :insensitive_strings
105
+ # Perform case-insensitive match of strings not defined as terminals, and map to either upper or lower case.
103
106
  # @yield [data, block]
104
107
  # @yieldparam [Hash] data
105
108
  # A Hash defined for the current production, during :start
@@ -182,6 +185,8 @@ module EBNF::PEG
182
185
  # @option options[Integer] :high_water passed to lexer
183
186
  # @option options [Logger] :logger for errors/progress/debug.
184
187
  # @option options[Integer] :low_water passed to lexer
188
+ # @option options[Boolean] :seq_hash (false)
189
+ # If `true`, sets the default for the value sent to a production handler that is for a `seq` to a hash composed of the flattened constituent hashes that are otherwise provided.
185
190
  # @option options [Symbol, Regexp] :whitespace
186
191
  # Symbol of whitespace rule (defaults to `@pass`), or a regular expression
187
192
  # for eating whitespace between non-terminal rules (strongly encouraged).
@@ -195,6 +200,7 @@ module EBNF::PEG
195
200
  # @raise [Exception] Raises exceptions for parsing errors
196
201
  # or errors raised during processing callbacks. Internal
197
202
  # errors are raised using {Error}.
203
+ # @todo FIXME implement seq_hash
198
204
  def parse(input = nil, start = nil, rules = nil, **options, &block)
199
205
  start ||= options[:start]
200
206
  rules ||= options[:rules] || []
@@ -467,10 +473,19 @@ module EBNF::PEG
467
473
  #
468
474
  # @param [Symbol] sym
469
475
  # @return [Regexp]
470
- def find_terminal_regexp(sym)
476
+ def terminal_regexp(sym)
471
477
  self.class.terminal_regexps[sym]
472
478
  end
473
479
 
480
+ ##
481
+ # Find the options defined for a terminal
482
+ #
483
+ # @param [Symbol] sym
484
+ # @return [Hash]
485
+ def terminal_options(sym)
486
+ self.class.terminal_options[sym]
487
+ end
488
+
474
489
  ##
475
490
  # Record furthest failure.
476
491
  #
data/lib/ebnf/peg/rule.rb CHANGED
@@ -1,6 +1,8 @@
1
1
  module EBNF::PEG
2
2
  # Behavior for parsing a PEG rule
3
3
  module Rule
4
+ include ::EBNF::Unescape
5
+
4
6
  ##
5
7
  # Initialized by parser when loading rules.
6
8
  # Used for finding rules and invoking elements of the parse process.
@@ -45,9 +47,18 @@ module EBNF::PEG
45
47
  # If the terminal is defined with a regular expression,
46
48
  # use that to match the input,
47
49
  # otherwise,
48
- if regexp = parser.find_terminal_regexp(sym)
49
- matched = input.scan(regexp)
50
+ if regexp = parser.terminal_regexp(sym)
51
+ term_opts = parser.terminal_options(sym)
52
+ if matched = input.scan(regexp)
53
+ # Optionally map matched
54
+ matched = term_opts.fetch(:map, {}).fetch(matched.downcase, matched)
55
+
56
+ # Optionally unescape matched
57
+ matched = unescape(matched) if term_opts[:unescape]
58
+ end
59
+
50
60
  result = parser.onTerminal(sym, (matched ? matched : :unmatched))
61
+
51
62
  # Update furthest failure for strings and terminals
52
63
  parser.update_furthest_failure(input.pos, input.lineno, sym) if result == :unmatched
53
64
  parser.packrat[sym][pos] = {
@@ -61,6 +72,7 @@ module EBNF::PEG
61
72
  eat_whitespace(input)
62
73
  end
63
74
  start_options = parser.onStart(sym)
75
+ string_regexp_opts = start_options[:insensitive_strings] ? Regexp::IGNORECASE : 0
64
76
 
65
77
  result = case expr.first
66
78
  when :alt
@@ -74,7 +86,12 @@ module EBNF::PEG
74
86
  raise "No rule found for #{prod}" unless rule
75
87
  rule.parse(input)
76
88
  when String
77
- input.scan(Regexp.new(Regexp.quote(prod))) || :unmatched
89
+ s = input.scan(Regexp.new(Regexp.quote(prod), string_regexp_opts))
90
+ case start_options[:insensitive_strings]
91
+ when :lower then s && s.downcase
92
+ when :upper then s && s.upcase
93
+ else s
94
+ end || :unmatched
78
95
  end
79
96
  if alt == :unmatched
80
97
  # Update furthest failure for strings and terminals
@@ -112,7 +129,7 @@ module EBNF::PEG
112
129
  raise "No rule found for #{prod}" unless rule
113
130
  rule.parse(input)
114
131
  when String
115
- input.scan(Regexp.new(Regexp.quote(prod))) || :unmatched
132
+ input.scan(Regexp.new(Regexp.quote(prod), string_regexp_opts)) || :unmatched
116
133
  end
117
134
  if res != :unmatched
118
135
  # Update furthest failure for terminals
@@ -123,7 +140,7 @@ module EBNF::PEG
123
140
  end
124
141
  when :opt
125
142
  # Result is the matched value or nil
126
- opt = rept(input, 0, 1, expr[1])
143
+ opt = rept(input, 0, 1, expr[1], string_regexp_opts, **start_options)
127
144
 
128
145
  # Update furthest failure for strings and terminals
129
146
  parser.update_furthest_failure(input.pos, input.lineno, expr[1]) if terminal?
@@ -131,7 +148,7 @@ module EBNF::PEG
131
148
  when :plus
132
149
  # Result is an array of all expressions while they match,
133
150
  # at least one must match
134
- plus = rept(input, 1, '*', expr[1])
151
+ plus = rept(input, 1, '*', expr[1], string_regexp_opts)
135
152
 
136
153
  # Update furthest failure for strings and terminals
137
154
  parser.update_furthest_failure(input.pos, input.lineno, expr[1]) if terminal?
@@ -146,7 +163,7 @@ module EBNF::PEG
146
163
  when :rept
147
164
  # Result is an array of all expressions while they match,
148
165
  # an empty array if none match
149
- rept = rept(input, expr[1], expr[2], expr[3])
166
+ rept = rept(input, expr[1], expr[2], expr[3], string_regexp_opts)
150
167
 
151
168
  # # Update furthest failure for strings and terminals
152
169
  parser.update_furthest_failure(input.pos, input.lineno, expr[3]) if terminal?
@@ -161,7 +178,12 @@ module EBNF::PEG
161
178
  raise "No rule found for #{prod}" unless rule
162
179
  rule.parse(input)
163
180
  when String
164
- input.scan(Regexp.new(Regexp.quote(prod))) || :unmatched
181
+ s = input.scan(Regexp.new(Regexp.quote(prod), string_regexp_opts))
182
+ case start_options[:insensitive_strings]
183
+ when :lower then s && s.downcase
184
+ when :upper then s && s.upcase
185
+ else s
186
+ end || :unmatched
165
187
  end
166
188
  if res == :unmatched
167
189
  # Update furthest failure for strings and terminals
@@ -182,7 +204,7 @@ module EBNF::PEG
182
204
  when :star
183
205
  # Result is an array of all expressions while they match,
184
206
  # an empty array if none match
185
- star = rept(input, 0, '*', expr[1])
207
+ star = rept(input, 0, '*', expr[1], string_regexp_opts)
186
208
 
187
209
  # Update furthest failure for strings and terminals
188
210
  parser.update_furthest_failure(input.pos, input.lineno, expr[1]) if terminal?
@@ -214,8 +236,9 @@ module EBNF::PEG
214
236
  # @param [Integer] max
215
237
  # If it is an integer, it stops matching after max entries.
216
238
  # @param [Symbol, String] prod
239
+ # @param [Integer] string_regexp_opts
217
240
  # @return [:unmatched, Array]
218
- def rept(input, min, max, prod)
241
+ def rept(input, min, max, prod, string_regexp_opts, **options)
219
242
  result = []
220
243
 
221
244
  case prod
@@ -227,9 +250,13 @@ module EBNF::PEG
227
250
  result << res
228
251
  end
229
252
  when String
230
- while (res = input.scan(Regexp.new(Regexp.quote(prod)))) && (max == '*' || result.length < max)
253
+ while (res = input.scan(Regexp.new(Regexp.quote(prod), string_regexp_opts))) && (max == '*' || result.length < max)
231
254
  eat_whitespace(input) unless terminal?
232
- result << res
255
+ result << case options[:insensitive_strings]
256
+ when :lower then res.downcase
257
+ when :upper then res.upcase
258
+ else res
259
+ end
233
260
  end
234
261
  end
235
262
 
@@ -0,0 +1,62 @@
1
+ # encoding: utf-8
2
+ # Unescape strings
3
+ module EBNF::Unescape
4
+ ESCAPE_CHARS = {
5
+ '\\t' => "\t", # \u0009 (tab)
6
+ '\\n' => "\n", # \u000A (line feed)
7
+ '\\r' => "\r", # \u000D (carriage return)
8
+ '\\b' => "\b", # \u0008 (backspace)
9
+ '\\f' => "\f", # \u000C (form feed)
10
+ '\\"' => '"', # \u0022 (quotation mark, double quote mark)
11
+ "\\'" => '\'', # \u0027 (apostrophe-quote, single quote mark)
12
+ '\\\\' => '\\' # \u005C (backslash)
13
+ }.freeze
14
+ ESCAPE_CHAR4 = /\\u(?:[0-9A-Fa-f]{4,4})/u.freeze # \uXXXX
15
+ ESCAPE_CHAR8 = /\\U(?:[0-9A-Fa-f]{8,8})/u.freeze # \UXXXXXXXX
16
+ ECHAR = /\\./u.freeze # More liberal unescaping
17
+ UCHAR = /#{ESCAPE_CHAR4}|#{ESCAPE_CHAR8}/n.freeze
18
+
19
+ ##
20
+ # Returns a copy of the given `input` string with all `\uXXXX` and
21
+ # `\UXXXXXXXX` Unicode codepoint escape sequences replaced with their
22
+ # unescaped UTF-8 character counterparts.
23
+ #
24
+ # @param [String] string
25
+ # @return [String]
26
+ # @see https://www.w3.org/TR/rdf-sparql-query/#codepointEscape
27
+ def unescape_codepoints(string)
28
+ string = string.dup
29
+ string.force_encoding(Encoding::ASCII_8BIT) if string.respond_to?(:force_encoding)
30
+
31
+ # Decode \uXXXX and \UXXXXXXXX code points:
32
+ string = string.gsub(UCHAR) do |c|
33
+ s = [(c[2..-1]).hex].pack('U*')
34
+ s.respond_to?(:force_encoding) ? s.force_encoding(Encoding::ASCII_8BIT) : s
35
+ end
36
+
37
+ string.force_encoding(Encoding::UTF_8) if string.respond_to?(:force_encoding)
38
+ string
39
+ end
40
+ module_function :unescape_codepoints
41
+
42
+ ##
43
+ # Returns a copy of the given `input` string with all string escape
44
+ # sequences (e.g. `\n` and `\t`) replaced with their unescaped UTF-8
45
+ # character counterparts.
46
+ #
47
+ # @param [String] input
48
+ # @return [String]
49
+ # @see https://www.w3.org/TR/rdf-sparql-query/#grammarEscapes
50
+ def unescape_string(input)
51
+ input.gsub(ECHAR) { |escaped| ESCAPE_CHARS[escaped] || escaped[1..-1]}
52
+ end
53
+ module_function :unescape_string
54
+
55
+ # Perform string and codepoint unescaping if defined for this terminal
56
+ # @param [String] string
57
+ # @return [String]
58
+ def unescape(string)
59
+ unescape_string(unescape_codepoints(string))
60
+ end
61
+ module_function :unescape
62
+ end
data/lib/ebnf/writer.rb CHANGED
@@ -181,12 +181,11 @@ module EBNF
181
181
 
182
182
  if validate
183
183
  begin
184
- require 'nokogumbo'
185
184
  # Validate the output HTML
186
185
  doc = Nokogiri::HTML5("<!DOCTYPE html>" + html_result, max_errors: 10)
187
186
  raise EncodingError, "Errors found in generated HTML:\n " +
188
187
  doc.errors.map(&:to_s).join("\n ") unless doc.errors.empty?
189
- rescue LoadError
188
+ rescue LoadError, NoMethodError
190
189
  # Skip
191
190
  end
192
191
  end
data/lib/ebnf.rb CHANGED
@@ -9,6 +9,7 @@ module EBNF
9
9
  autoload :PEG, "ebnf/peg"
10
10
  autoload :Rule, "ebnf/rule"
11
11
  autoload :Terminals,"ebnf/terminals"
12
+ autoload :Unescape, "ebnf/unescape"
12
13
  autoload :Writer, "ebnf/writer"
13
14
  autoload :VERSION, "ebnf/version"
14
15
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ebnf
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.1.3
4
+ version: 2.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gregg Kellogg
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-04-21 00:00:00.000000000 Z
11
+ date: 2021-08-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: sxp
@@ -268,6 +268,7 @@ files:
268
268
  - lib/ebnf/peg/rule.rb
269
269
  - lib/ebnf/rule.rb
270
270
  - lib/ebnf/terminals.rb
271
+ - lib/ebnf/unescape.rb
271
272
  - lib/ebnf/version.rb
272
273
  - lib/ebnf/writer.rb
273
274
  homepage: https://github.com/dryruby/ebnf
@@ -289,7 +290,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
289
290
  - !ruby/object:Gem::Version
290
291
  version: '0'
291
292
  requirements: []
292
- rubygems_version: 3.2.3
293
+ rubygems_version: 3.2.15
293
294
  signing_key:
294
295
  specification_version: 4
295
296
  summary: EBNF parser and parser generator in Ruby.