RubyGems - rbscmlex - Versions diffs - 0.1.2 → 0.1.3 - Mend

rbscmlex 0.1.2 → 0.1.3

Files changed (6) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: ec8d6513252c0196840bfc388ab2cb429d5d1cb579e5fb23d034e6509ab24bf5
-  data.tar.gz: fd952eecb26646994dcfe9786fa5678b68f5ab04191009079f47acedaba1af3a
+  metadata.gz: 2860b1b9f6d206dcfacf04fba88a5c64e9670c97d42423d5fac78615c3609d3a
+  data.tar.gz: c2ab02c9febc928efa76fa47aea965c27a53099b56ff55e180304988f9eea062
 SHA512:
-  metadata.gz: 6344300170f133448eb6f6b1646ffa0e617a900c3df87b00f9a48cd6c008da73e8b9161cac4afb4d752a37c3981cd7ad1fa324c48044d484c26cd836bcfe5cc9
-  data.tar.gz: ecf7e267663faa64023c1f72ed82fac38b799d2fd370d7a6725e512780618ebbd70f121bf8031abb3738f9827dbd5e7c855ad700055b18dbbec2b79356d3f16b
+  metadata.gz: 8a008f3c549e69bfa839d6788449704afb5b2491084eacde00ce1809769d4c65fffc4f22b0843c5b7522908891146f4ad4f227ef1ae67f1f07d0a33f97bbba3a
+  data.tar.gz: 68e14c34c40ea2b4715a20d9cac1f56c6860eeb6d9d3c8e2d31b6addf66fb529217e88b272ef1ccc252cb52645fb41dc3d77a8c6d0eb5af902b936af7b851e87

data/CHANGELOG.md CHANGED Viewed

@@ -7,6 +7,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/).
 ## [Unreleased]
 - (nothing to record here)
+## [0.1.3] - 2021-05-15
+### Added
+- Add `Lexer#skip_token(offset)`
+### Changed
+- Modify `Lexer#next_token` to accept an optional argument that
+  specifies an offset from the read position.
+### Fixed
+- Fix issue #4: Some "peculiar identifiers" are regarded as illegal.
 ## [0.1.2] - 2021-05-07
 ### Added
 - Add a mechanism to initialize a Parser instance from an array of

data/lib/rbscmlex/lexer.rb CHANGED Viewed

@@ -25,43 +25,6 @@ module Rbscmlex
     end
-    # :stopdoc:
-    BOOLEAN    = /\A#(f(alse)?|t(rue)?)\Z/
-    STRING     = /\A\"[^\"]*\"\Z/
-    # idents
-    EXTENDED_CHARS = "!\\$%&\\*\\+\\-\\./:<=>\\?@\\^_~"
-    IDENT_PAT  = "[a-zA-Z_][a-zA-Z0-9#{EXTENDED_CHARS}]*"
-    IDENTIFIER = Regexp.new("\\A#{IDENT_PAT}\\Z")
-    # operators
-    ARITHMETIC_OPS = /\A[+\-*\/%]\Z/
-    COMPARISON_OPS = /\A([<>]=?|=)\Z/
-    # numbers
-    REAL_PAT   = "(([1-9][0-9]*)|0)(\.[0-9]+)?"
-    RAT_PAT    = "#{REAL_PAT}\\/#{REAL_PAT}"
-    C_REAL_PAT = "(#{REAL_PAT}|#{RAT_PAT})"
-    C_IMAG_PAT = "#{C_REAL_PAT}"
-    COMP_PAT   = "#{C_REAL_PAT}(\\+|\\-)#{C_IMAG_PAT}i"
-    REAL_NUM   = Regexp.new("\\A[+-]?#{REAL_PAT}\\Z")
-    RATIONAL   = Regexp.new("\\A[+-]?#{RAT_PAT}\\Z")
-    COMPLEX    = Regexp.new("\\A[+-]?#{COMP_PAT}\\Z")
-    PURE_IMAG  = Regexp.new("\\A[+-](#{C_IMAG_PAT})?i\\Z")
-    # char
-    SINGLE_CHAR_PAT = "."
-    SPACE_PAT       = "space"
-    NEWLINE_PAT     = "newline"
-    CHAR_PREFIX = "\#\\\\"
-    CHAR_PAT    = "(#{SINGLE_CHAR_PAT}|#{SPACE_PAT}|#{NEWLINE_PAT})"
-    CHAR        = Regexp.new("\\A#{CHAR_PREFIX}#{CHAR_PAT}\\Z")
-    # :startdoc:
     include Enumerable
     def initialize(obj, form: TOKEN_DEFAULT_FORM)
@@ -86,7 +49,8 @@ module Rbscmlex
     end
     def [](index)
-      convert(@tokens[index])
+      token = @tokens[index]
+      token and convert(token)
     end
     def each(&blk)
@@ -112,16 +76,25 @@ module Rbscmlex
       self[@current_pos]
     end
-    def next_token
-      check_pos
-      @current_pos = @next_pos
-      @next_pos += 1
+    def next_token(offset = 0)
+      check_pos(offset)
+      skip_token(offset)
       self[@current_pos]
     end
-    def peek_token(num = 0)
-      check_pos
-      self[@next_pos + num]
+    def peek_token(offset = 0)
+      # Since `peek_token` does not modify the position to read, raise
+      # StopIteration only if the next position truly exceed the
+      # bound.
+      check_pos(0)
+      self[@next_pos + offset]
+    end
+    def skip_token(offset = 0)
+      check_pos(offset)
+      @current_pos = @next_pos + offset
+      @next_pos += (1 + offset)
+      nil
     end
     def rewind
@@ -192,11 +165,39 @@ module Rbscmlex
       converter ? token.map(&converter) : tokens
     end
-    def check_pos
-      raise StopIteration if @next_pos >= size
+    def check_pos(offset = 0)
+      raise StopIteration if (@next_pos + offset) >= size
     end
-    S2R_MAP = { "(" => "( ", ")" => " ) ", "'" => " ' " } # :nodoc:
+    # :stopdoc:
+    S2R_MAP = { "(" => "( ", ")" => " ) ", "'" => " ' " }
+    BOOLEAN    = /\A#(f(alse)?|t(rue)?)\Z/
+    STRING     = /\A\"[^\"]*\"\Z/
+    # numbers
+    REAL_PAT   = "(([1-9][0-9]*)|0)(\.[0-9]+)?"
+    RAT_PAT    = "#{REAL_PAT}\\/#{REAL_PAT}"
+    C_REAL_PAT = "(#{REAL_PAT}|#{RAT_PAT})"
+    C_IMAG_PAT = "#{C_REAL_PAT}"
+    COMP_PAT   = "#{C_REAL_PAT}(\\+|\\-)#{C_IMAG_PAT}i"
+    REAL_NUM   = Regexp.new("\\A[+-]?#{REAL_PAT}\\Z")
+    RATIONAL   = Regexp.new("\\A[+-]?#{RAT_PAT}\\Z")
+    COMPLEX    = Regexp.new("\\A[+-]?#{COMP_PAT}\\Z")
+    PURE_IMAG  = Regexp.new("\\A[+-](#{C_IMAG_PAT})?i\\Z")
+    # char
+    SINGLE_CHAR_PAT = "."
+    SPACE_PAT       = "space"
+    NEWLINE_PAT     = "newline"
+    CHAR_PREFIX = "\#\\\\"
+    CHAR_PAT    = "(#{SINGLE_CHAR_PAT}|#{SPACE_PAT}|#{NEWLINE_PAT})"
+    CHAR        = Regexp.new("\\A#{CHAR_PREFIX}#{CHAR_PAT}\\Z")
+    # :startdoc:
     def tokenize(src)
       cooked = src.gsub(/[()']/, S2R_MAP)
@@ -213,24 +214,111 @@ module Rbscmlex
           Rbscmlex.new_token(:quotation, literal)
         when "#("
           Rbscmlex.new_token(:vec_lparen, literal)
+        when "|"                # not supported yet
+          Rbscmlex.new_token(:illegal, literal)
         when BOOLEAN
           Rbscmlex.new_token(:boolean, literal)
-        when IDENTIFIER
-          Rbscmlex.new_token(:identifier, literal)
         when CHAR
           Rbscmlex.new_token(:character, literal)
         when STRING
           Rbscmlex.new_token(:string, literal)
-        when ARITHMETIC_OPS, COMPARISON_OPS
-          Rbscmlex.new_token(:op_proc, literal)
         when REAL_NUM, RATIONAL, COMPLEX, PURE_IMAG
           Rbscmlex.new_token(:number, literal)
         else
-          Rbscmlex.new_token(:illegal, literal)
+          if Identifier.identifier?(literal)
+            Rbscmlex.new_token(:identifier, literal)
+          else
+            Rbscmlex.new_token(:illegal, literal)
+          end
         end
       }
     end
+    # Holds functions to check a literal is valid as an identifier
+    # defined in R7RS.
+    #
+    # Call identifier? function as follows:
+    #
+    #   Identifier.identifier?(literal)
+    #
+    # It returns true if the literal is valid as an identifier.
+    module Identifier
+      DIGIT              = "0-9"
+      LETTER             = "a-zA-Z"
+      SPECIAL_INITIAL    = "!\\$%&\\*/:<=>\\?\\^_~"
+      INITIAL            = "#{LETTER}#{SPECIAL_INITIAL}"
+      EXPLICIT_SIGN      = "\\+\\-"
+      SPECIAL_SUBSEQUENT = "#{EXPLICIT_SIGN}\\.@"
+      SUBSEQUENT         = "#{INITIAL}#{DIGIT}#{SPECIAL_SUBSEQUENT}"
+      REGEXP_INITIAL = Regexp.new("[#{INITIAL}]")
+      REGEXP_EXPLICIT_SIGN = Regexp.new("[#{EXPLICIT_SIGN}]")
+      REGEXP_SUBSEQUENT = Regexp.new("[#{SUBSEQUENT}]+")
+      def self.identifier?(literal)
+        size = literal.size
+        c = literal[0]
+        case c
+        when REGEXP_INITIAL
+          return true if size == 1
+          subsequent?(literal[1..-1])
+        when REGEXP_EXPLICIT_SIGN
+          return true if size == 1
+          if literal[1] == "."
+            dot_identifier?(literal[1..-1])
+          else
+            if sign_subsequent?(literal[1])
+              return true if size == 2
+              subsequent?(literal[2..-1])
+            else
+              false
+            end
+          end
+        when "."
+          dot_identifier?(literal)
+        else
+          false
+        end
+      end
+      def self.subsequent?(sub_literal)
+        REGEXP_SUBSEQUENT === sub_literal
+      end
+      def self.sign_subsequent?(sub_literal)
+        return false if sub_literal.size != 1
+        case sub_literal[0]
+        when REGEXP_INITIAL
+          true
+        when REGEXP_EXPLICIT_SIGN
+          true
+        when "@"
+          true
+        else
+          false
+        end
+      end
+      def self.dot_identifier?(sub_literal)
+        return false if sub_literal[0] != "."
+        return true if sub_literal.size == 1
+        if dot_subsequent?(sub_literal[1])
+          return true if sub_literal.size == 2
+          subsequent?(sub_literal[2..-1])
+        else
+          false
+        end
+      end
+      def self.dot_subsequent?(sub_literal)
+        return true if sub_literal == "."
+        sign_subsequent?(sub_literal)
+      end
+    end
     # :startdoc:
   end

data/lib/rbscmlex/token.rb CHANGED Viewed

@@ -27,8 +27,6 @@ module Rbscmlex
     :number,                  # `123`, `456.789`, `1/2`, `3+4i`
     :character,               # `#\a`
     :string,                  # `"hoge"`
-    # operators
-    :op_proc,                 # `+`, `-`, ...
     # control
     :illegal,
   ]

data/lib/rbscmlex/version.rb CHANGED Viewed

@@ -1,6 +1,6 @@
 # frozen_string_literal: true
 module Rbscmlex
-  VERSION = "0.1.2"
-  RELEASE = "2021-05-07"
+  VERSION = "0.1.3"
+  RELEASE = "2021-05-15"
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rbscmlex
 version: !ruby/object:Gem::Version
-  version: 0.1.2
+  version: 0.1.3
 platform: ruby
 authors:
 - mnbi
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2021-05-07 00:00:00.000000000 Z
+date: 2021-05-15 00:00:00.000000000 Z
 dependencies: []
 description: A simple lexical analyzer for Scheme
 email: