RubyGems - ebnf - Versions diffs - 2.1.3 → 2.2.0 - Mend

ebnf 2.1.3 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: d3b1d692ce9936bb68c8c89722a06b90b7cfd6c42231ff57d49780a98214a325
-  data.tar.gz: da6430584d824a1d070d364879c17b0b3b0bd8c9b32b44af841cd039e153e7f6
+  metadata.gz: '08b2411d5c4d34425d00259126e0d6f55c086b2c60c74e8d3ddc6a099a60ec5e'
+  data.tar.gz: d8185780e437d3db9c2644d62f51d497b25be130d20b79d63e3101e222180408
 SHA512:
-  metadata.gz: 4e0bacbdef9b82ecca13fca9babcf8fe643acb3afaad077b05aebfc237f542859a1e6bea67e6b2e776c2e81b51dd7900cd24e39d9712582de6959d5e65ecfa42
-  data.tar.gz: 3001f1864c2cf8fade2856fb7d3383f4b1ac8519d2914e54cc55b8a461f6f208d41c3fa35539367bad104a74068cc868220255ce933f301894530eacb0abf51e
+  metadata.gz: b972788258b8261d6e59a093268f31be74d3db13535b0db63199aae9ed36b93602d6b8034439152ce62361797eb3990b747d33a55551baa01bd2d1a9aed6bf6f
+  data.tar.gz: cc3b0bb1ecd8c0f0e7135989e96bb826d35f1406ecc3482e88a00f77aaf1df0c8ab5c6f98cb11b37c2c83f77b2d9d762f227d20a0bec44a27cbe48616c31f4a4

data/README.md CHANGED Viewed

@@ -93,7 +93,7 @@ Inevitably while implementing a parser for some specific grammar, a developer wi
 The {EBNF::Writer} class can be used to write parsed grammars out, either as formatted text, or HTML. Because grammars are written from the Abstract Syntax Tree, represented as [S-Expressions][S-Expression], this provides a means of transforming between grammar formats (e.g., W3C [EBNF][] to [ABNF][]), although with some potential loss in semantic fidelity (case-insensitive string matching vs. case-sensitive matching).
-The formatted HTML results are designed to be appropriate for including in specifications. If the [Nokogumbo](https://rubygems.org/gems/nokogumbo) gem list available, the resulting HTML encoded grammar will also be validated.
+The formatted HTML results are designed to be appropriate for including in specifications.
 ### Parser Errors
 On a parsing failure, an exception is raised with information that may be useful in determining the source of the error.

data/VERSION CHANGED Viewed

	@@ -1 +1 @@
1	- 2.1.3
1	+ 2.2.0

data/lib/ebnf/ll1/lexer.rb CHANGED Viewed

@@ -32,60 +32,12 @@ module EBNF::LL1
   # @see https://en.wikipedia.org/wiki/Lexical_analysis
   class Lexer
     include Enumerable
-    ESCAPE_CHARS         = {
-      '\\t'   => "\t",  # \u0009 (tab)
-      '\\n'   => "\n",  # \u000A (line feed)
-      '\\r'   => "\r",  # \u000D (carriage return)
-      '\\b'   => "\b",  # \u0008 (backspace)
-      '\\f'   => "\f",  # \u000C (form feed)
-      '\\"'  => '"',    # \u0022 (quotation mark, double quote mark)
-      "\\'"  => '\'',   # \u0027 (apostrophe-quote, single quote mark)
-      '\\\\' => '\\'    # \u005C (backslash)
-    }.freeze
-    ESCAPE_CHAR4        = /\\u(?:[0-9A-Fa-f]{4,4})/u.freeze    # \uXXXX
-    ESCAPE_CHAR8        = /\\U(?:[0-9A-Fa-f]{8,8})/u.freeze    # \UXXXXXXXX
-    ECHAR               = /\\./u.freeze                        # More liberal unescaping
-    UCHAR               = /#{ESCAPE_CHAR4}|#{ESCAPE_CHAR8}/n.freeze
+    include ::EBNF::Unescape
     ##
     # @return [Regexp] defines whitespace, including comments, otherwise whitespace must be explicit in terminals
     attr_reader :whitespace
-    ##
-    # Returns a copy of the given `input` string with all `\uXXXX` and
-    # `\UXXXXXXXX` Unicode codepoint escape sequences replaced with their
-    # unescaped UTF-8 character counterparts.
-    #
-    # @param  [String] string
-    # @return [String]
-    # @see    https://www.w3.org/TR/rdf-sparql-query/#codepointEscape
-    def self.unescape_codepoints(string)
-      string = string.dup
-      string.force_encoding(Encoding::ASCII_8BIT) if string.respond_to?(:force_encoding)
-      # Decode \uXXXX and \UXXXXXXXX code points:
-      string = string.gsub(UCHAR) do |c|
-        s = [(c[2..-1]).hex].pack('U*')
-        s.respond_to?(:force_encoding) ? s.force_encoding(Encoding::ASCII_8BIT) : s
-      end
-      string.force_encoding(Encoding::UTF_8) if string.respond_to?(:force_encoding)
-      string
-    end
-    ##
-    # Returns a copy of the given `input` string with all string escape
-    # sequences (e.g. `\n` and `\t`) replaced with their unescaped UTF-8
-    # character counterparts.
-    #
-    # @param  [String] input
-    # @return [String]
-    # @see    https://www.w3.org/TR/rdf-sparql-query/#grammarEscapes
-    def self.unescape_string(input)
-      input.gsub(ECHAR) { |escaped| ESCAPE_CHARS[escaped] || escaped[1..-1]}
-    end
     ##
     # Tokenizes the given `input` string or stream.
     #
@@ -338,7 +290,7 @@ module EBNF::LL1
       # @return [String]
       def unescape(string)
         if @options[:unescape]
-          Lexer.unescape_string(Lexer.unescape_codepoints(string))
+          EBNF::Unescape.unescape(string)
         else
           string
         end

data/lib/ebnf/native.rb CHANGED Viewed

@@ -287,10 +287,10 @@ module EBNF
       case m = s[0,1]
       when '"', "'" # STRING1 or STRING2
         l, s = s[1..-1].split(m.rstrip, 2)
-        [LL1::Lexer.unescape_string(l), s]
+        [Unescape.unescape_string(l), s]
       when '[' # RANGE, O_RANGE
         l, s = s[1..-1].split(/(?<=[^\\])\]/, 2)
-        [[:range, LL1::Lexer.unescape_string(l)], s]
+        [[:range, Unescape.unescape_string(l)], s]
       when '#' # HEX
         s.match(/(#x\h+)(.*)$/)
         l, s = $1, $2

data/lib/ebnf/peg/parser.rb CHANGED Viewed

@@ -55,6 +55,7 @@ module EBNF::PEG
       def production_handlers; (@production_handlers ||= {}); end
       def terminal_handlers; (@terminal_handlers ||= {}); end
       def terminal_regexps; (@terminal_regexps ||= {}); end
+      def terminal_options; (@terminal_options ||= {}); end
       ##
       # Defines the pattern for a terminal node and a block to be invoked
@@ -72,9 +73,8 @@ module EBNF::PEG
       #   defaults to the expression defined in the associated rule.
       #   If unset, the terminal rule is used for matching.
       # @param [Hash] options
-      # @option options [Hash{String => String}] :map ({})
-      #   A mapping from terminals, in lower-case form, to
-      #   their canonical value
+      # @option options [Boolean] :unescape
+      #   Cause strings and codepoints to be unescaped.
       # @yield [value, prod]
       # @yieldparam [String] value
       #   The scanned terminal value.
@@ -86,6 +86,7 @@ module EBNF::PEG
       def terminal(term, regexp = nil, **options, &block)
         terminal_regexps[term] = regexp if regexp
         terminal_handlers[term] = block if block_given?
+        terminal_options[term] = options.freeze
       end
       ##
@@ -100,6 +101,8 @@ module EBNF::PEG
       #   Options which are returned from {Parser#onStart}.
       # @option options [Boolean] :as_hash (false)
       #   If the production is a `seq`, causes the value to be represented as a single hash, rather than an array of individual hashes for each sub-production. Note that this is not always advisable due to the possibility of repeated productions within the sequence.
+      # @option options [:upper, :lower] :insensitive_strings
+      #   Perform case-insensitive match of strings not defined as terminals, and map to either upper or lower case.
       # @yield [data, block]
       # @yieldparam [Hash] data
       #   A Hash defined for the current production, during :start
@@ -182,6 +185,8 @@ module EBNF::PEG
     # @option options [Integer] :high_water passed to lexer
     # @option options [Logger] :logger for errors/progress/debug.
     # @option options [Integer] :low_water passed to lexer
+    # @option options [Boolean] :seq_hash (false)
+    #   If `true`, sets the default for the value sent to a production handler that is for a `seq` to a hash composed of the flattened constituent hashes that are otherwise provided.
     # @option options [Symbol, Regexp] :whitespace
     #   Symbol of whitespace rule (defaults to `@pass`), or a regular expression
     #   for eating whitespace between non-terminal rules (strongly encouraged).
@@ -195,6 +200,7 @@ module EBNF::PEG
     # @raise [Exception] Raises exceptions for parsing errors
     #   or errors raised during processing callbacks. Internal
     #   errors are raised using {Error}.
+    # @todo FIXME implement seq_hash
     def parse(input = nil, start = nil, rules = nil, **options, &block)
       start ||= options[:start]
       rules ||= options[:rules] || []
@@ -467,10 +473,19 @@ module EBNF::PEG
     #
     # @param [Symbol] sym
     # @return [Regexp]
-    def find_terminal_regexp(sym)
+    def terminal_regexp(sym)
       self.class.terminal_regexps[sym]
     end
+    ##
+    # Find the options defined for a terminal
+    #
+    # @param [Symbol] sym
+    # @return [Hash]
+    def terminal_options(sym)
+      self.class.terminal_options[sym]
+    end
     ##
     # Record furthest failure.
     #

data/lib/ebnf/peg/rule.rb CHANGED Viewed

@@ -1,6 +1,8 @@
 module EBNF::PEG
   # Behavior for parsing a PEG rule
   module Rule
+    include ::EBNF::Unescape
     ##
     # Initialized by parser when loading rules.
     # Used for finding rules and invoking elements of the parse process.
@@ -45,9 +47,18 @@ module EBNF::PEG
         # If the terminal is defined with a regular expression,
         # use that to match the input,
         # otherwise,
-        if regexp = parser.find_terminal_regexp(sym)
-          matched = input.scan(regexp)
+        if regexp = parser.terminal_regexp(sym)
+          term_opts = parser.terminal_options(sym)
+          if matched = input.scan(regexp)
+            # Optionally map matched
+            matched = term_opts.fetch(:map, {}).fetch(matched.downcase, matched)
+            # Optionally unescape matched
+            matched = unescape(matched) if term_opts[:unescape]
+          end
           result = parser.onTerminal(sym, (matched ? matched : :unmatched))
           # Update furthest failure for strings and terminals
           parser.update_furthest_failure(input.pos, input.lineno, sym) if result == :unmatched
           parser.packrat[sym][pos] = {
@@ -61,6 +72,7 @@ module EBNF::PEG
         eat_whitespace(input)
       end
       start_options = parser.onStart(sym)
+      string_regexp_opts = start_options[:insensitive_strings] ? Regexp::IGNORECASE : 0
       result = case expr.first
       when :alt
@@ -74,7 +86,12 @@ module EBNF::PEG
             raise "No rule found for #{prod}" unless rule
             rule.parse(input)
           when String
-            input.scan(Regexp.new(Regexp.quote(prod))) || :unmatched
+            s = input.scan(Regexp.new(Regexp.quote(prod), string_regexp_opts))
+            case start_options[:insensitive_strings]
+            when :lower then s && s.downcase
+            when :upper then s && s.upcase
+            else s
+            end || :unmatched
           end
           if alt == :unmatched
             # Update furthest failure for strings and terminals
@@ -112,7 +129,7 @@ module EBNF::PEG
           raise "No rule found for #{prod}" unless rule
           rule.parse(input)
         when String
-          input.scan(Regexp.new(Regexp.quote(prod))) || :unmatched
+          input.scan(Regexp.new(Regexp.quote(prod), string_regexp_opts)) || :unmatched
         end
         if res != :unmatched
           # Update furthest failure for terminals
@@ -123,7 +140,7 @@ module EBNF::PEG
         end
       when :opt
         # Result is the matched value or nil
-        opt = rept(input, 0, 1, expr[1])
+        opt = rept(input, 0, 1, expr[1], string_regexp_opts, **start_options)
         # Update furthest failure for strings and terminals
         parser.update_furthest_failure(input.pos, input.lineno, expr[1]) if terminal?
@@ -131,7 +148,7 @@ module EBNF::PEG
       when :plus
         # Result is an array of all expressions while they match,
         # at least one must match
-        plus = rept(input, 1, '*', expr[1])
+        plus = rept(input, 1, '*', expr[1], string_regexp_opts)
         # Update furthest failure for strings and terminals
         parser.update_furthest_failure(input.pos, input.lineno, expr[1]) if terminal?
@@ -146,7 +163,7 @@ module EBNF::PEG
       when :rept
         # Result is an array of all expressions while they match,
         # an empty array of none match
-        rept = rept(input, expr[1], expr[2], expr[3])
+        rept = rept(input, expr[1], expr[2], expr[3], string_regexp_opts)
         # # Update furthest failure for strings and terminals
         parser.update_furthest_failure(input.pos, input.lineno, expr[3]) if terminal?
@@ -161,7 +178,12 @@ module EBNF::PEG
             raise "No rule found for #{prod}" unless rule
             rule.parse(input)
           when String
-            input.scan(Regexp.new(Regexp.quote(prod))) || :unmatched
+            s = input.scan(Regexp.new(Regexp.quote(prod), string_regexp_opts))
+            case start_options[:insensitive_strings]
+            when :lower then s && s.downcase
+            when :upper then s && s.upcase
+            else s
+            end || :unmatched
           end
           if res == :unmatched
             # Update furthest failure for strings and terminals
@@ -182,7 +204,7 @@ module EBNF::PEG
       when :star
         # Result is an array of all expressions while they match,
         # an empty array if none match
-        star = rept(input, 0, '*', expr[1])
+        star = rept(input, 0, '*', expr[1], string_regexp_opts)
         # Update furthest failure for strings and terminals
         parser.update_furthest_failure(input.pos, input.lineno, expr[1]) if terminal?
@@ -214,8 +236,9 @@ module EBNF::PEG
     # @param [Integer] max
     #   If it is an integer, it stops matching after max entries.
     # @param [Symbol, String] prod
+    # @param [Integer] string_regexp_opts
     # @return [:unmatched, Array]
-    def rept(input, min, max, prod)
+    def rept(input, min, max, prod, string_regexp_opts, **options)
       result = []
       case prod
@@ -227,9 +250,13 @@ module EBNF::PEG
           result << res
         end
       when String
-        while (res = input.scan(Regexp.new(Regexp.quote(prod)))) && (max == '*' || result.length < max)
+        while (res = input.scan(Regexp.new(Regexp.quote(prod), string_regexp_opts))) && (max == '*' || result.length < max)
           eat_whitespace(input) unless terminal?
-          result << res
+          result << case options[:insensitive_strings]
+          when :lower then res.downcase
+          when :upper then res.upcase
+          else res
+          end
         end
       end

data/lib/ebnf/unescape.rb ADDED Viewed

@@ -0,0 +1,62 @@
+# encoding: utf-8
+# Unescape strings
+module EBNF::Unescape
+  ESCAPE_CHARS         = {
+    '\\t'   => "\t",  # \u0009 (tab)
+    '\\n'   => "\n",  # \u000A (line feed)
+    '\\r'   => "\r",  # \u000D (carriage return)
+    '\\b'   => "\b",  # \u0008 (backspace)
+    '\\f'   => "\f",  # \u000C (form feed)
+    '\\"'  => '"',    # \u0022 (quotation mark, double quote mark)
+    "\\'"  => '\'',   # \u0027 (apostrophe-quote, single quote mark)
+    '\\\\' => '\\'    # \u005C (backslash)
+  }.freeze
+  ESCAPE_CHAR4        = /\\u(?:[0-9A-Fa-f]{4,4})/u.freeze    # \uXXXX
+  ESCAPE_CHAR8        = /\\U(?:[0-9A-Fa-f]{8,8})/u.freeze    # \UXXXXXXXX
+  ECHAR               = /\\./u.freeze                        # More liberal unescaping
+  UCHAR               = /#{ESCAPE_CHAR4}|#{ESCAPE_CHAR8}/n.freeze
+  ##
+  # Returns a copy of the given `input` string with all `\uXXXX` and
+  # `\UXXXXXXXX` Unicode codepoint escape sequences replaced with their
+  # unescaped UTF-8 character counterparts.
+  #
+  # @param  [String] string
+  # @return [String]
+  # @see    https://www.w3.org/TR/rdf-sparql-query/#codepointEscape
+  def unescape_codepoints(string)
+    string = string.dup
+    string.force_encoding(Encoding::ASCII_8BIT) if string.respond_to?(:force_encoding)
+    # Decode \uXXXX and \UXXXXXXXX code points:
+    string = string.gsub(UCHAR) do |c|
+      s = [(c[2..-1]).hex].pack('U*')
+      s.respond_to?(:force_encoding) ? s.force_encoding(Encoding::ASCII_8BIT) : s
+    end
+    string.force_encoding(Encoding::UTF_8) if string.respond_to?(:force_encoding)
+    string
+  end
+  module_function :unescape_codepoints
+  ##
+  # Returns a copy of the given `input` string with all string escape
+  # sequences (e.g. `\n` and `\t`) replaced with their unescaped UTF-8
+  # character counterparts.
+  #
+  # @param  [String] input
+  # @return [String]
+  # @see    https://www.w3.org/TR/rdf-sparql-query/#grammarEscapes
+  def unescape_string(input)
+    input.gsub(ECHAR) { |escaped| ESCAPE_CHARS[escaped] || escaped[1..-1]}
+  end
+  module_function :unescape_string
+  # Perform codepoint unescaping followed by string unescaping
+  # @param [String] string
+  # @return [String]
+  def unescape(string)
+    unescape_string(unescape_codepoints(string))
+  end
+  module_function :unescape
+end

data/lib/ebnf/writer.rb CHANGED Viewed

@@ -181,12 +181,11 @@ module EBNF
           if validate
             begin
-              require 'nokogumbo'
               # Validate the output HTML
               doc = Nokogiri::HTML5("<!DOCTYPE html>" + html_result, max_errors: 10)
               raise EncodingError, "Errors found in generated HTML:\n  " +
                 doc.errors.map(&:to_s).join("\n  ") unless doc.errors.empty?
-            rescue LoadError
+            rescue LoadError, NoMethodError
               # Skip
             end
           end

data/lib/ebnf.rb CHANGED Viewed

@@ -9,6 +9,7 @@ module EBNF
   autoload :PEG,      "ebnf/peg"
   autoload :Rule,     "ebnf/rule"
   autoload :Terminals,"ebnf/terminals"
+  autoload :Unescape, "ebnf/unescape"
   autoload :Writer,   "ebnf/writer"
   autoload :VERSION,  "ebnf/version"

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: ebnf
 version: !ruby/object:Gem::Version
-  version: 2.1.3
+  version: 2.2.0
 platform: ruby
 authors:
 - Gregg Kellogg
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2021-04-21 00:00:00.000000000 Z
+date: 2021-08-25 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: sxp
@@ -268,6 +268,7 @@ files:
 - lib/ebnf/peg/rule.rb
 - lib/ebnf/rule.rb
 - lib/ebnf/terminals.rb
+- lib/ebnf/unescape.rb
 - lib/ebnf/version.rb
 - lib/ebnf/writer.rb
 homepage: https://github.com/dryruby/ebnf
@@ -289,7 +290,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.2.3
+rubygems_version: 3.2.15
 signing_key:
 specification_version: 4
 summary: EBNF parser and parser generator in Ruby.