ebnf 2.1.3 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d3b1d692ce9936bb68c8c89722a06b90b7cfd6c42231ff57d49780a98214a325
4
- data.tar.gz: da6430584d824a1d070d364879c17b0b3b0bd8c9b32b44af841cd039e153e7f6
3
+ metadata.gz: '08b2411d5c4d34425d00259126e0d6f55c086b2c60c74e8d3ddc6a099a60ec5e'
4
+ data.tar.gz: d8185780e437d3db9c2644d62f51d497b25be130d20b79d63e3101e222180408
5
5
  SHA512:
6
- metadata.gz: 4e0bacbdef9b82ecca13fca9babcf8fe643acb3afaad077b05aebfc237f542859a1e6bea67e6b2e776c2e81b51dd7900cd24e39d9712582de6959d5e65ecfa42
7
- data.tar.gz: 3001f1864c2cf8fade2856fb7d3383f4b1ac8519d2914e54cc55b8a461f6f208d41c3fa35539367bad104a74068cc868220255ce933f301894530eacb0abf51e
6
+ metadata.gz: b972788258b8261d6e59a093268f31be74d3db13535b0db63199aae9ed36b93602d6b8034439152ce62361797eb3990b747d33a55551baa01bd2d1a9aed6bf6f
7
+ data.tar.gz: cc3b0bb1ecd8c0f0e7135989e96bb826d35f1406ecc3482e88a00f77aaf1df0c8ab5c6f98cb11b37c2c83f77b2d9d762f227d20a0bec44a27cbe48616c31f4a4
data/README.md CHANGED
@@ -93,7 +93,7 @@ Inevitably while implementing a parser for some specific grammar, a developer wi
93
93
 
94
94
  The {EBNF::Writer} class can be used to write parsed grammars out, either as formatted text, or HTML. Because grammars are written from the Abstract Syntax Tree, represented as [S-Expressions][S-Expression], this provides a means of transforming between grammar formats (e.g., W3C [EBNF][] to [ABNF][]), although with some potential loss in semantic fidelity (case-insensitive string matching vs. case-sensitive matching).
95
95
 
96
- The formatted HTML results are designed to be appropriate for including in specifications. If the [Nokogumbo](https://rubygems.org/gems/nokogumbo) gem list available, the resulting HTML encoded grammar will also be validated.
96
+ The formatted HTML results are designed to be appropriate for including in specifications.
97
97
 
98
98
  ### Parser Errors
99
99
  On a parsing failure, an exception is raised with information that may be useful in determining the source of the error.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 2.1.3
1
+ 2.2.0
@@ -32,60 +32,12 @@ module EBNF::LL1
32
32
  # @see https://en.wikipedia.org/wiki/Lexical_analysis
33
33
  class Lexer
34
34
  include Enumerable
35
-
36
- ESCAPE_CHARS = {
37
- '\\t' => "\t", # \u0009 (tab)
38
- '\\n' => "\n", # \u000A (line feed)
39
- '\\r' => "\r", # \u000D (carriage return)
40
- '\\b' => "\b", # \u0008 (backspace)
41
- '\\f' => "\f", # \u000C (form feed)
42
- '\\"' => '"', # \u0022 (quotation mark, double quote mark)
43
- "\\'" => '\'', # \u0027 (apostrophe-quote, single quote mark)
44
- '\\\\' => '\\' # \u005C (backslash)
45
- }.freeze
46
- ESCAPE_CHAR4 = /\\u(?:[0-9A-Fa-f]{4,4})/u.freeze # \uXXXX
47
- ESCAPE_CHAR8 = /\\U(?:[0-9A-Fa-f]{8,8})/u.freeze # \UXXXXXXXX
48
- ECHAR = /\\./u.freeze # More liberal unescaping
49
- UCHAR = /#{ESCAPE_CHAR4}|#{ESCAPE_CHAR8}/n.freeze
35
+ include ::EBNF::Unescape
50
36
 
51
37
  ##
52
38
  # @return [Regexp] defines whitespace, including comments, otherwise whitespace must be explicit in terminals
53
39
  attr_reader :whitespace
54
40
 
55
- ##
56
- # Returns a copy of the given `input` string with all `\uXXXX` and
57
- # `\UXXXXXXXX` Unicode codepoint escape sequences replaced with their
58
- # unescaped UTF-8 character counterparts.
59
- #
60
- # @param [String] string
61
- # @return [String]
62
- # @see https://www.w3.org/TR/rdf-sparql-query/#codepointEscape
63
- def self.unescape_codepoints(string)
64
- string = string.dup
65
- string.force_encoding(Encoding::ASCII_8BIT) if string.respond_to?(:force_encoding)
66
-
67
- # Decode \uXXXX and \UXXXXXXXX code points:
68
- string = string.gsub(UCHAR) do |c|
69
- s = [(c[2..-1]).hex].pack('U*')
70
- s.respond_to?(:force_encoding) ? s.force_encoding(Encoding::ASCII_8BIT) : s
71
- end
72
-
73
- string.force_encoding(Encoding::UTF_8) if string.respond_to?(:force_encoding)
74
- string
75
- end
76
-
77
- ##
78
- # Returns a copy of the given `input` string with all string escape
79
- # sequences (e.g. `\n` and `\t`) replaced with their unescaped UTF-8
80
- # character counterparts.
81
- #
82
- # @param [String] input
83
- # @return [String]
84
- # @see https://www.w3.org/TR/rdf-sparql-query/#grammarEscapes
85
- def self.unescape_string(input)
86
- input.gsub(ECHAR) { |escaped| ESCAPE_CHARS[escaped] || escaped[1..-1]}
87
- end
88
-
89
41
  ##
90
42
  # Tokenizes the given `input` string or stream.
91
43
  #
@@ -338,7 +290,7 @@ module EBNF::LL1
338
290
  # @return [String]
339
291
  def unescape(string)
340
292
  if @options[:unescape]
341
- Lexer.unescape_string(Lexer.unescape_codepoints(string))
293
+ EBNF::Unescape.unescape(string)
342
294
  else
343
295
  string
344
296
  end
data/lib/ebnf/native.rb CHANGED
@@ -287,10 +287,10 @@ module EBNF
287
287
  case m = s[0,1]
288
288
  when '"', "'" # STRING1 or STRING2
289
289
  l, s = s[1..-1].split(m.rstrip, 2)
290
- [LL1::Lexer.unescape_string(l), s]
290
+ [Unescape.unescape_string(l), s]
291
291
  when '[' # RANGE, O_RANGE
292
292
  l, s = s[1..-1].split(/(?<=[^\\])\]/, 2)
293
- [[:range, LL1::Lexer.unescape_string(l)], s]
293
+ [[:range, Unescape.unescape_string(l)], s]
294
294
  when '#' # HEX
295
295
  s.match(/(#x\h+)(.*)$/)
296
296
  l, s = $1, $2
@@ -55,6 +55,7 @@ module EBNF::PEG
55
55
  def production_handlers; (@production_handlers ||= {}); end
56
56
  def terminal_handlers; (@terminal_handlers ||= {}); end
57
57
  def terminal_regexps; (@terminal_regexps ||= {}); end
58
+ def terminal_options; (@terminal_options ||= {}); end
58
59
 
59
60
  ##
60
61
  # Defines the pattern for a terminal node and a block to be invoked
@@ -72,9 +73,8 @@ module EBNF::PEG
72
73
  # defaults to the expression defined in the associated rule.
73
74
  # If unset, the terminal rule is used for matching.
74
75
  # @param [Hash] options
75
- # @option options [Hash{String => String}] :map ({})
76
- # A mapping from terminals, in lower-case form, to
77
- # their canonical value
76
+ # @option options [Boolean] :unescape
77
+ # Cause strings and codepoints to be unescaped.
78
78
  # @yield [value, prod]
79
79
  # @yieldparam [String] value
80
80
  # The scanned terminal value.
@@ -86,6 +86,7 @@ module EBNF::PEG
86
86
  def terminal(term, regexp = nil, **options, &block)
87
87
  terminal_regexps[term] = regexp if regexp
88
88
  terminal_handlers[term] = block if block_given?
89
+ terminal_options[term] = options.freeze
89
90
  end
90
91
 
91
92
  ##
@@ -100,6 +101,8 @@ module EBNF::PEG
100
101
  # Options which are returned from {Parser#onStart}.
101
102
  # @option options [Boolean] :as_hash (false)
102
103
  # If the production is a `seq`, causes the value to be represented as a single hash, rather than an array of individual hashes for each sub-production. Note that this is not always advisable due to the possibility of repeated productions within the sequence.
104
+ # @option options[:upper, :lower] :insensitive_strings
105
+ # Perform case-insensitive match of strings not defined as terminals, and map to either upper or lower case.
103
106
  # @yield [data, block]
104
107
  # @yieldparam [Hash] data
105
108
  # A Hash defined for the current production, during :start
@@ -182,6 +185,8 @@ module EBNF::PEG
182
185
  # @option options[Integer] :high_water passed to lexer
183
186
  # @option options [Logger] :logger for errors/progress/debug.
184
187
  # @option options[Integer] :low_water passed to lexer
188
+ # @option options[Boolean] :seq_hash (false)
189
+ # If `true`, sets the default for the value sent to a production handler that is for a `seq` to a hash composed of the flattened constituent hashes that are otherwise provided.
185
190
  # @option options [Symbol, Regexp] :whitespace
186
191
  # Symbol of whitespace rule (defaults to `@pass`), or a regular expression
187
192
  # for eating whitespace between non-terminal rules (strongly encouraged).
@@ -195,6 +200,7 @@ module EBNF::PEG
195
200
  # @raise [Exception] Raises exceptions for parsing errors
196
201
  # or errors raised during processing callbacks. Internal
197
202
  # errors are raised using {Error}.
203
+ # @todo FIXME implement seq_hash
198
204
  def parse(input = nil, start = nil, rules = nil, **options, &block)
199
205
  start ||= options[:start]
200
206
  rules ||= options[:rules] || []
@@ -467,10 +473,19 @@ module EBNF::PEG
467
473
  #
468
474
  # @param [Symbol] sym
469
475
  # @return [Regexp]
470
- def find_terminal_regexp(sym)
476
+ def terminal_regexp(sym)
471
477
  self.class.terminal_regexps[sym]
472
478
  end
473
479
 
480
+ ##
481
+ # Find the options defined for a terminal
482
+ #
483
+ # @param [Symbol] sym
484
+ # @return [Hash]
485
+ def terminal_options(sym)
486
+ self.class.terminal_options[sym]
487
+ end
488
+
474
489
  ##
475
490
  # Record furthest failure.
476
491
  #
data/lib/ebnf/peg/rule.rb CHANGED
@@ -1,6 +1,8 @@
1
1
  module EBNF::PEG
2
2
  # Behavior for parsing a PEG rule
3
3
  module Rule
4
+ include ::EBNF::Unescape
5
+
4
6
  ##
5
7
  # Initialized by parser when loading rules.
6
8
  # Used for finding rules and invoking elements of the parse process.
@@ -45,9 +47,18 @@ module EBNF::PEG
45
47
  # If the terminal is defined with a regular expression,
46
48
  # use that to match the input,
47
49
  # otherwise,
48
- if regexp = parser.find_terminal_regexp(sym)
49
- matched = input.scan(regexp)
50
+ if regexp = parser.terminal_regexp(sym)
51
+ term_opts = parser.terminal_options(sym)
52
+ if matched = input.scan(regexp)
53
+ # Optionally map matched
54
+ matched = term_opts.fetch(:map, {}).fetch(matched.downcase, matched)
55
+
56
+ # Optionally unescape matched
57
+ matched = unescape(matched) if term_opts[:unescape]
58
+ end
59
+
50
60
  result = parser.onTerminal(sym, (matched ? matched : :unmatched))
61
+
51
62
  # Update furthest failure for strings and terminals
52
63
  parser.update_furthest_failure(input.pos, input.lineno, sym) if result == :unmatched
53
64
  parser.packrat[sym][pos] = {
@@ -61,6 +72,7 @@ module EBNF::PEG
61
72
  eat_whitespace(input)
62
73
  end
63
74
  start_options = parser.onStart(sym)
75
+ string_regexp_opts = start_options[:insensitive_strings] ? Regexp::IGNORECASE : 0
64
76
 
65
77
  result = case expr.first
66
78
  when :alt
@@ -74,7 +86,12 @@ module EBNF::PEG
74
86
  raise "No rule found for #{prod}" unless rule
75
87
  rule.parse(input)
76
88
  when String
77
- input.scan(Regexp.new(Regexp.quote(prod))) || :unmatched
89
+ s = input.scan(Regexp.new(Regexp.quote(prod), string_regexp_opts))
90
+ case start_options[:insensitive_strings]
91
+ when :lower then s && s.downcase
92
+ when :upper then s && s.upcase
93
+ else s
94
+ end || :unmatched
78
95
  end
79
96
  if alt == :unmatched
80
97
  # Update furthest failure for strings and terminals
@@ -112,7 +129,7 @@ module EBNF::PEG
112
129
  raise "No rule found for #{prod}" unless rule
113
130
  rule.parse(input)
114
131
  when String
115
- input.scan(Regexp.new(Regexp.quote(prod))) || :unmatched
132
+ input.scan(Regexp.new(Regexp.quote(prod), string_regexp_opts)) || :unmatched
116
133
  end
117
134
  if res != :unmatched
118
135
  # Update furthest failure for terminals
@@ -123,7 +140,7 @@ module EBNF::PEG
123
140
  end
124
141
  when :opt
125
142
  # Result is the matched value or nil
126
- opt = rept(input, 0, 1, expr[1])
143
+ opt = rept(input, 0, 1, expr[1], string_regexp_opts, **start_options)
127
144
 
128
145
  # Update furthest failure for strings and terminals
129
146
  parser.update_furthest_failure(input.pos, input.lineno, expr[1]) if terminal?
@@ -131,7 +148,7 @@ module EBNF::PEG
131
148
  when :plus
132
149
  # Result is an array of all expressions while they match,
133
150
  # at least one must match
134
- plus = rept(input, 1, '*', expr[1])
151
+ plus = rept(input, 1, '*', expr[1], string_regexp_opts)
135
152
 
136
153
  # Update furthest failure for strings and terminals
137
154
  parser.update_furthest_failure(input.pos, input.lineno, expr[1]) if terminal?
@@ -146,7 +163,7 @@ module EBNF::PEG
146
163
  when :rept
147
164
  # Result is an array of all expressions while they match,
148
165
  # an empty array if none match
149
- rept = rept(input, expr[1], expr[2], expr[3])
166
+ rept = rept(input, expr[1], expr[2], expr[3], string_regexp_opts)
150
167
 
151
168
  # # Update furthest failure for strings and terminals
152
169
  parser.update_furthest_failure(input.pos, input.lineno, expr[3]) if terminal?
@@ -161,7 +178,12 @@ module EBNF::PEG
161
178
  raise "No rule found for #{prod}" unless rule
162
179
  rule.parse(input)
163
180
  when String
164
- input.scan(Regexp.new(Regexp.quote(prod))) || :unmatched
181
+ s = input.scan(Regexp.new(Regexp.quote(prod), string_regexp_opts))
182
+ case start_options[:insensitive_strings]
183
+ when :lower then s && s.downcase
184
+ when :upper then s && s.upcase
185
+ else s
186
+ end || :unmatched
165
187
  end
166
188
  if res == :unmatched
167
189
  # Update furthest failure for strings and terminals
@@ -182,7 +204,7 @@ module EBNF::PEG
182
204
  when :star
183
205
  # Result is an array of all expressions while they match,
184
206
  # an empty array of none match
185
- star = rept(input, 0, '*', expr[1])
207
+ star = rept(input, 0, '*', expr[1], string_regexp_opts)
186
208
 
187
209
  # Update furthest failure for strings and terminals
188
210
  parser.update_furthest_failure(input.pos, input.lineno, expr[1]) if terminal?
@@ -214,8 +236,9 @@ module EBNF::PEG
214
236
  # @param [Integer] max
215
237
  # If it is an integer, it stops matching after max entries.
216
238
  # @param [Symbol, String] prod
239
+ # @param [Integer] string_regexp_opts
217
240
  # @return [:unmatched, Array]
218
- def rept(input, min, max, prod)
241
+ def rept(input, min, max, prod, string_regexp_opts, **options)
219
242
  result = []
220
243
 
221
244
  case prod
@@ -227,9 +250,13 @@ module EBNF::PEG
227
250
  result << res
228
251
  end
229
252
  when String
230
- while (res = input.scan(Regexp.new(Regexp.quote(prod)))) && (max == '*' || result.length < max)
253
+ while (res = input.scan(Regexp.new(Regexp.quote(prod), string_regexp_opts))) && (max == '*' || result.length < max)
231
254
  eat_whitespace(input) unless terminal?
232
- result << res
255
+ result << case options[:insensitive_strings]
256
+ when :lower then res.downcase
257
+ when :upper then res.upcase
258
+ else res
259
+ end
233
260
  end
234
261
  end
235
262
 
@@ -0,0 +1,62 @@
1
+ # encoding: utf-8
2
+ # Unescape strings
3
+ module EBNF::Unescape
4
+ ESCAPE_CHARS = {
5
+ '\\t' => "\t", # \u0009 (tab)
6
+ '\\n' => "\n", # \u000A (line feed)
7
+ '\\r' => "\r", # \u000D (carriage return)
8
+ '\\b' => "\b", # \u0008 (backspace)
9
+ '\\f' => "\f", # \u000C (form feed)
10
+ '\\"' => '"', # \u0022 (quotation mark, double quote mark)
11
+ "\\'" => '\'', # \u0027 (apostrophe-quote, single quote mark)
12
+ '\\\\' => '\\' # \u005C (backslash)
13
+ }.freeze
14
+ ESCAPE_CHAR4 = /\\u(?:[0-9A-Fa-f]{4,4})/u.freeze # \uXXXX
15
+ ESCAPE_CHAR8 = /\\U(?:[0-9A-Fa-f]{8,8})/u.freeze # \UXXXXXXXX
16
+ ECHAR = /\\./u.freeze # More liberal unescaping
17
+ UCHAR = /#{ESCAPE_CHAR4}|#{ESCAPE_CHAR8}/n.freeze
18
+
19
+ ##
20
+ # Returns a copy of the given `input` string with all `\uXXXX` and
21
+ # `\UXXXXXXXX` Unicode codepoint escape sequences replaced with their
22
+ # unescaped UTF-8 character counterparts.
23
+ #
24
+ # @param [String] string
25
+ # @return [String]
26
+ # @see https://www.w3.org/TR/rdf-sparql-query/#codepointEscape
27
+ def unescape_codepoints(string)
28
+ string = string.dup
29
+ string.force_encoding(Encoding::ASCII_8BIT) if string.respond_to?(:force_encoding)
30
+
31
+ # Decode \uXXXX and \UXXXXXXXX code points:
32
+ string = string.gsub(UCHAR) do |c|
33
+ s = [(c[2..-1]).hex].pack('U*')
34
+ s.respond_to?(:force_encoding) ? s.force_encoding(Encoding::ASCII_8BIT) : s
35
+ end
36
+
37
+ string.force_encoding(Encoding::UTF_8) if string.respond_to?(:force_encoding)
38
+ string
39
+ end
40
+ module_function :unescape_codepoints
41
+
42
+ ##
43
+ # Returns a copy of the given `input` string with all string escape
44
+ # sequences (e.g. `\n` and `\t`) replaced with their unescaped UTF-8
45
+ # character counterparts.
46
+ #
47
+ # @param [String] input
48
+ # @return [String]
49
+ # @see https://www.w3.org/TR/rdf-sparql-query/#grammarEscapes
50
+ def unescape_string(input)
51
+ input.gsub(ECHAR) { |escaped| ESCAPE_CHARS[escaped] || escaped[1..-1]}
52
+ end
53
+ module_function :unescape_string
54
+
55
+ # Perform string and codepoint unescaping if defined for this terminal
56
+ # @param [String] string
57
+ # @return [String]
58
+ def unescape(string)
59
+ unescape_string(unescape_codepoints(string))
60
+ end
61
+ module_function :unescape
62
+ end
data/lib/ebnf/writer.rb CHANGED
@@ -181,12 +181,11 @@ module EBNF
181
181
 
182
182
  if validate
183
183
  begin
184
- require 'nokogumbo'
185
184
  # Validate the output HTML
186
185
  doc = Nokogiri::HTML5("<!DOCTYPE html>" + html_result, max_errors: 10)
187
186
  raise EncodingError, "Errors found in generated HTML:\n " +
188
187
  doc.errors.map(&:to_s).join("\n ") unless doc.errors.empty?
189
- rescue LoadError
188
+ rescue LoadError, NoMethodError
190
189
  # Skip
191
190
  end
192
191
  end
data/lib/ebnf.rb CHANGED
@@ -9,6 +9,7 @@ module EBNF
9
9
  autoload :PEG, "ebnf/peg"
10
10
  autoload :Rule, "ebnf/rule"
11
11
  autoload :Terminals,"ebnf/terminals"
12
+ autoload :Unescape, "ebnf/unescape"
12
13
  autoload :Writer, "ebnf/writer"
13
14
  autoload :VERSION, "ebnf/version"
14
15
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ebnf
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.1.3
4
+ version: 2.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gregg Kellogg
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-04-21 00:00:00.000000000 Z
11
+ date: 2021-08-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: sxp
@@ -268,6 +268,7 @@ files:
268
268
  - lib/ebnf/peg/rule.rb
269
269
  - lib/ebnf/rule.rb
270
270
  - lib/ebnf/terminals.rb
271
+ - lib/ebnf/unescape.rb
271
272
  - lib/ebnf/version.rb
272
273
  - lib/ebnf/writer.rb
273
274
  homepage: https://github.com/dryruby/ebnf
@@ -289,7 +290,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
289
290
  - !ruby/object:Gem::Version
290
291
  version: '0'
291
292
  requirements: []
292
- rubygems_version: 3.2.3
293
+ rubygems_version: 3.2.15
293
294
  signing_key:
294
295
  specification_version: 4
295
296
  summary: EBNF parser and parser generator in Ruby.