code-lexer 0.6 → 0.8

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 0be86a493e60a21bc3c8d16598c6f6af914a2521255bb604b01ffa9410ff399f
- data.tar.gz: 46e60be81ceda1cc9621ef7519189a45cceaaa54afa27313a5c6d4c8be0e8377
+ metadata.gz: 28e3de74936d4c5e81abc995cec85c4831c2383eedc098ef9097c45002e59bbb
+ data.tar.gz: a7f113035e970f213de2e0301454b6851eba80f661adb4ce4720545e280fbcef
  SHA512:
- metadata.gz: adcc4ee93e5c58da53df775584dbd4de485d8af7af232e2a7aa379a1ec09c1e928a804139983d585773f3b837bdb6f9fe95623107dfceaa0fb3afc2ddd5a7593
- data.tar.gz: e9ee16aa762e15515d58930b29b5c8f784baee483f40fa1ae520734a0efc98f7680ffa5a4f6c88de3c261b836e17bb9e529aa4adc55314928d3a60f7f0d0d2da
+ metadata.gz: 1ccf664386fd4ca8b505658d0059d1d44a4b428b84ab79ed5ea1f9d12219f7daa6261d1d058b797080d8719c1d842fad47441d2b99da578c4691c00e5e109efe
+ data.tar.gz: 65914ad6a9f937ce884a5e7c737e4e9857b5be9aa5dc81ba912d6c25a44deff5efeaff2b11b1a0cfe6ad7b77859a2862fe745d9a280b3f639be2e3cb3a678d6e
lib/code-lexer/abstractor.rb CHANGED
@@ -1,11 +1,14 @@
  require_relative 'token'

  module CodeLexer
- class Abstractor
- attr_reader :dictionary
-
- def initialize(dictionary=[])
- @dictionary = ["NOOP"] + dictionary
+ class Abstractor
+ def initialize(identifiers_dictionary = [], strings_dictionary = [], numbers_dictionary = [])
+ @dictionary = {}
+ @dictionary[:identifiers] = ['NOOP'] + identifiers_dictionary
+ @dictionary[:strings] = strings_dictionary
+ @dictionary[:numbers] = numbers_dictionary
+
+ @abstractor_pieces = []
  end

  def abstract_everything
@@ -18,103 +21,240 @@ module CodeLexer
  return self
  end

+ def dictionary
+ warn "[DEPRECATION] The method CodeLexer::Abstractor#dictionary is deprecated; used CodeLexer::Abstractor#identifiers_dictionary instead"
+ self.identifiers_dictionary
+ end
+
+ def identifiers_dictionary
+ @dictionary[:identifiers]
+ end
+
+ def strings_dictionary
+ @dictionary[:strings]
+ end
+
+ def numbers_dictionary
+ @dictionary[:numbers]
+ end
+
+ def dictionaries
+ @dictionary
+ end
+
  def abstract_identifiers
- @abstract_identifiers = true
+ @abstractor_pieces << IdentifierAbstractor.new(self)
  return self
  end

  def abstract_numbers
- @abstract_numbers = true
+ @abstractor_pieces << NumberAbstractor.new(self)
  return self
  end

  def abstract_comments
- @abstract_comments = true
+ @abstractor_pieces << CommentAbstractor.new(self)
  return self
  end

  def abstract_strings
- @abstract_strings = true
+ @abstractor_pieces << StringAbstractor.new(self)
  return self
  end

  def abstract_spaces
- @abstract_spaces = true
+ @abstractor_pieces << SpaceAbstractor.new(self)
  return self
  end

  def remove_spaces
- @remove_spaces = true
+ @abstractor_pieces << SpaceRemover.new(self)
  return self
  end

  def remove_newlines
- @remove_newlines = true
+ @abstractor_pieces << NewlineRemover.new(self)
  return self
  end

  def remove_comments
- @remove_comments = true
+ @abstractor_pieces << CommentRemover.new(self)
  return self
  end

  def abstract!(tokens)
- if @abstract_identifiers
- identifier_tokens = tokens.select { |t| t.type == :identifier }
- identifiers = identifier_tokens.map { |id| id.value }.uniq
-
- identifiers.each do |id|
- if @dictionary.include?(id)
- abstracted_id = @dictionary.index(id)
- else
- abstracted_id = @dictionary.size
- @dictionary << id
- end
-
- identifier_tokens.select { |t| t.value == id }.each do |matching_token|
- matching_token.abstracted_value = Token.special("ID#{abstracted_id}")
- end
- end
+ @abstractor_pieces.each do |abstractor_piece|
+ tokens = abstractor_piece.abstract(tokens)
  end

- if @remove_comments
- tokens.delete_if { |t| t.type == :comment }
- elsif @abstract_comments
- tokens.select { |t| t.type == :comment }.each do |comment_token|
- comment_token.abstracted_value = Token.special("COMMENT")
- end
+ return self
+ end
+
+ def deabstract!(tokens)
+ @abstractor_pieces.each do |abstractor_piece|
+ tokens = abstractor_piece.deabstract(tokens)
  end

- if @abstract_numbers
- tokens.select { |t| t.type == :number }.each do |number_token|
- number_token.abstracted_value = Token.special("NUMBER")
+ return self
+ end
+ end
+
+ class AbstractorPiece
+ def initialize(abstractor)
+ @abstractor = abstractor
+ end
+
+ def abstract(tokens)
+ return tokens
+ end
+
+ def deabstract(tokens)
+ return tokens
+ end
+ end
+
+ class IdentifierAbstractor < AbstractorPiece
+ def abstract(tokens)
+ identifier_tokens = tokens.select { |t| t.type == :identifier }
+ identifiers = identifier_tokens.map { |id| id.value }.uniq
+
+ identifiers.each do |id|
+ if @abstractor.identifiers_dictionary.include?(id)
+ abstracted_id = @abstractor.identifiers_dictionary.index(id)
+ else
+ abstracted_id = @abstractor.identifiers_dictionary.size
+ @abstractor.identifiers_dictionary << id
+ end
+
+ identifier_tokens.select { |t| t.value == id }.each do |matching_token|
+ matching_token.abstracted_value = Token.special("ID#{abstracted_id}")
  end
  end

- if @abstract_strings
- tokens.select { |t| t.type == :string }.each do |string_token|
- string_token.abstracted_value = Token.special("STRING")
- end
+ return tokens
+ end
+
+ def deabstract(tokens)
+ tokens.select { |t| t.abstracted_value.match?(/.ID[0-9]+./) }.each do |token|
+ id = token.abstracted_value.scan(/.ID([0-9]+)./).flatten[0].to_i
+
+ token.type = :identifier
+ token.value = @abstractor.identifiers_dictionary[id]
  end

- if @remove_newlines
- tokens.delete_if { |t| t.type == :newline }
+ return tokens
+ end
+ end
+
+ class NumberAbstractor < AbstractorPiece
+ def abstract(tokens)
+ tokens.select { |t| t.type == :number }.each do |number_token|
+ number_token.abstracted_value = Token.special("NUMBER")
+ @abstractor.numbers_dictionary << number_token.value
  end

- if @remove_spaces
- tokens.delete_if { |t| t.type == :space }
- elsif @abstract_spaces
- tokens.select { |t| t.type == :space }.each do |space_token|
- previous_index = tokens.index(space_token) - 1
- if previous_index < 0 || tokens[previous_index].type == :newline
- space_token.abstracted_value = Token.special("INDENTATION")
- else
- space_token.abstracted_value = Token.special("WHITESPACE")
- end
+ return tokens
+ end
+
+ def deabstract(tokens)
+ id = 0
+ tokens.select { |t| t.abstracted_value == Token.special("NUMBER") }.each do |token|
+ token.type = :number
+ token.value = @abstractor.numbers_dictionary[id]
+
+ id += 1
+ end
+
+ return tokens
+ end
+ end
+
+ class StringAbstractor < AbstractorPiece
+ def abstract(tokens)
+ tokens.select { |t| t.type == :string }.each do |string_token|
+ string_token.abstracted_value = Token.special("STRING")
+ @abstractor.strings_dictionary << string_token.value
+ end
+
+ return tokens
+ end
+
+ def deabstract(tokens)
+ id = 0
+ tokens.select { |t| t.abstracted_value == Token.special("STRING") }.each do |token|
+ token.type = :string
+ token.value = '"' + @abstractor.strings_dictionary[id] + '"'
+
+ id += 1
+ end
+
+ return tokens
+ end
+ end
+
+ class CommentAbstractor < AbstractorPiece
+ def abstract(tokens)
+ tokens.select { |t| t.type == :comment }.each do |comment_token|
+ comment_token.abstracted_value = Token.special("COMMENT")
+ end
+ return tokens
+ end
+
+ def deabstract(tokens)
+ tokens.select { |t| t.abstracted_value == Token.special("COMMENT") }.each do |token|
+ token.type = :comment
+ token.value = 'Unknown comment'
+ end
+
+ return tokens
+ end
+ end
+
+ class SpaceAbstractor < AbstractorPiece
+ def abstract(tokens)
+ tokens.select { |t| t.type == :space }.each do |space_token|
+ previous_index = tokens.index(space_token) - 1
+ if previous_index < 0 || tokens[previous_index].type == :newline
+ space_token.abstracted_value = Token.special("INDENTATION")
+ else
+ space_token.abstracted_value = Token.special("WHITESPACE")
  end
  end

- return self
+ return tokens
+ end
+
+ def deabstract(tokens)
+ tokens.select do |t|
+ t.abstracted_value == Token.special("INDENTATION") ||
+ t.abstracted_value == Token.special("WHITESPACE")
+ end.each do |token|
+ token.type = :space
+ token.value = ' '
+ end
+
+ return tokens
+ end
+ end
+
+ class SpaceRemover < AbstractorPiece
+ def abstract(tokens)
+ tokens.delete_if { |t| t.type == :space }
+ return tokens
+ end
+ end
+
+ class NewlineRemover < AbstractorPiece
+ def abstract(tokens)
+ tokens.delete_if { |t| t.type == :newline }
+ return tokens
+ end
+ end
+
+ class CommentRemover < AbstractorPiece
+ def abstract(tokens)
+ tokens.delete_if { |t| t.type == :comment }
+ return tokens
  end
  end
  end
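
For orientation, here is a minimal usage sketch of the reworked Abstractor API above (chained configuration, per-kind dictionaries, and the new deabstract!). The token values are invented for illustration and nothing here is taken from the gem's own documentation:

    require 'code-lexer'

    # Hand-built tokens, using only the Token constructor visible in this diff.
    tokens = [
      CodeLexer::Token.new(:identifier, "counter"),
      CodeLexer::Token.new(:operator, "="),
      CodeLexer::Token.new(:number, "42")
    ]

    abstractor = CodeLexer::Abstractor.new.abstract_identifiers.abstract_numbers
    abstractor.abstract!(tokens)        # each registered AbstractorPiece rewrites abstracted_value in turn

    abstractor.identifiers_dictionary   # => ["NOOP", "counter"]; the old #dictionary still works but warns
    abstractor.numbers_dictionary       # => ["42"]

    abstractor.deabstract!(tokens)      # pieces restore token values from the recorded dictionaries
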
lib/code-lexer/config.rb CHANGED
@@ -32,8 +32,7 @@ module CodeLexer

  parsed['lexer'].each do |name, regexs|
  regexs.each do |regex|
- p regex
- regex = Regexp.new("^" + regex)
+ regex = Regexp.new("^" + regex, Regexp::MULTILINE)
  @rules << [name.to_sym, regex]
  end
  end
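
The functional change here is that every "^"-anchored rule is now compiled with Regexp::MULTILINE (and the stray debug call "p regex" is gone). In Ruby, that flag makes "." match newline characters as well; a quick standalone check:

    # Without MULTILINE "." stops at the line break; with it the same pattern crosses it.
    Regexp.new("^a.b").match?("a\nb")                     # => false
    Regexp.new("^a.b", Regexp::MULTILINE).match?("a\nb")  # => true
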
lib/code-lexer/languages/java.yml ADDED
@@ -0,0 +1,38 @@
+ lexer:
+ keyword:
+ - (?:abstract|arguments|boolean|break|byte|case|catch|char|const|continue|debugger|default|delete|double|do|else|eval|false|finally|final|float|for|function|goto|if|implements|int|in|instanceof|interface|let|long|native|new|null|package|private|protected|public|return|short|static|switch|synchronized|this|throws|throw|transient|true|try|typeof|var|void|volatile|while|with|yield|class|enum|export|extends|import|super|from|strictfp)
+ identifier:
+ - "[$A-Za-z_][$A-Za-z0-9_]*"
+ comment:
+ - \/\/[^\n\r]*(?=[\n\r])
+ - \/\/.*$
+ - \/\*([^*]|[\r\n]|(\*+([^*/]|[\r\n])))*\*+\/
+ string:
+ - \"([^"]|\\\")*\"
+ - \'[^']*\'
+ regex:
+ - \/([^/]|\\\/)*\/[gim]*
+ number:
+ - \-?[0-9]*\.[0-9]e\-?[0-9]+
+ - \-?[0-9]*\.[0-9]
+ - \-?[1-9][0-9]*
+ - \-?0[Xx][0-9A-Fa-f]+
+ - \-?[0-9]
+ - \-?0[0-7]+
+ operator:
+ - (\=\=\=|\!\=\=)
+ - (\<\=|\>\=|\=\=|\!\=|\=\>)
+ - (\&\&|\|\||\!)
+ - (\+\=|\-\=|\/\=|\*\=|\%\=|\+\+|\-\-)
+ - (\&|\||\~|\^|\<\<|\>\>)
+ - (\=|\+|\-|\/|\*|\%)
+ - (\.|\,|\:)
+ - (\<|\>|\?)
+ parenthesis:
+ - (\(|\)|\[|\]|\{|\})
+ semicolon:
+ - \;
+ newline:
+ - "[\\n\\r]"
+ space:
+ - \s+
lib/code-lexer/languages/javascript.yml CHANGED
@@ -1,15 +1,17 @@
  lexer:
  keyword:
- - (?:abstract|arguments|boolean|break|byte|case|catch|char|const|continue|debugger|default|delete|do|double|else|eval|false|final|finally|float|for|function|goto|if|implements|in|instanceof|int|interface|let|long|native|new|null|package|private|protected|public|return|short|static|switch|synchronized|this|throw|throws|transient|true|try|typeof|var|void|volatile|while|with|yield|class|enum|export|extends|import|super|from)
+ - (?:abstract|arguments|boolean|break|byte|case|catch|char|const|continue|debugger|default|delete|double|do|else|eval|false|finally|final|float|for|function|goto|if|implements|int|in|instanceof|interface|let|long|native|new|null|package|private|protected|public|return|short|static|switch|synchronized|this|throws|throw|transient|true|try|typeof|var|void|volatile|while|with|yield|class|enum|export|extends|import|super|from)
  identifier:
  - "[$A-Za-z_][$A-Za-z0-9_]*"
  comment:
- - \/\/[^\n\r]*[\n\r]
+ - \/\/[^\n\r]*(?=[\n\r])
  - \/\/.*$
  - \/\*([^*]|[\r\n]|(\*+([^*/]|[\r\n])))*\*+\/
  string:
  - \"([^"]|\\\")*\"
  - \'[^']*\'
+ regex:
+ - \/([^/]|\\\/)*\/[gim]*
  number:
  - \-?[0-9]*\.[0-9]e\-?[0-9]+
  - \-?[0-9]*\.[0-9]
@@ -19,13 +21,13 @@ lexer:
  - \-?0[0-7]+
  operator:
  - (\=\=\=|\!\=\=)
- - (\<\=|\>\=|\=\=|\!\=)
+ - (\<\=|\>\=|\=\=|\!\=|\=\>)
  - (\&\&|\|\||\!)
  - (\+\=|\-\=|\/\=|\*\=|\%\=|\+\+|\-\-)
  - (\&|\||\~|\^|\<\<|\>\>)
  - (\=|\+|\-|\/|\*|\%)
  - (\.|\,|\:)
- - (\<|\>)
+ - (\<|\>|\?)
  parenthesis:
  - (\(|\)|\[|\]|\{|\})
  semicolon:
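
Both language files now use a lookahead for the single-line comment rule: the line break is no longer consumed as part of the comment match, so it stays in the input for the newline rule. A standalone comparison of the two patterns:

    old_rule = Regexp.new("^" + '\/\/[^\n\r]*[\n\r]')
    new_rule = Regexp.new("^" + '\/\/[^\n\r]*(?=[\n\r])')

    old_rule.match("// hi\nnext")[0]   # => "// hi\n"  (newline swallowed by the comment token)
    new_rule.match("// hi\nnext")[0]   # => "// hi"    (newline left for the newline rule)
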
lib/code-lexer/lexer.rb CHANGED
@@ -12,7 +12,7 @@ module CodeLexer
  end
  end

- def lex(content)
+ def lex(content, abstractor = nil)
  content = content.clone
  tokens = []
  while content.length > 0
@@ -23,17 +23,31 @@ module CodeLexer
  end
  end

- return LexedContent.new(tokens)
+ return LexedContent.new(tokens, abstractor)
  end
  end

  class LexedContent
  attr_reader :tokens
+ attr_reader :abstractor

- def initialize(tokens)
+ def self.from_stream_string(stream, abstractor)
+ tokens = stream.split(" ").map { |t| Token.from_string(t) }
+ abstractor.deabstract!(tokens)
+ return LexedContent.new(tokens, abstractor)
+ end
+
+ def initialize(tokens, abstractor = nil)
  @tokens = tokens
+ @abstractor = abstractor
+
+ @abstractor.abstract!(@tokens) if @abstractor
+ end
+
+ def reconstruct
+ @tokens.map { |t| t.value.to_s }.join("")
  end
-
+
  def token_lines
  result = []
  current_line = []
@@ -53,14 +67,73 @@ module CodeLexer
  end

  def token_stream(abstractor = nil)
- abstractor.abstract!(@tokens) if abstractor
-
  result = []
- @tokens.each do |token|
+
+ tokens = @tokens
+ if abstractor
+ tokens = tokens.map { |t| t.clone }
+ tokens.each { |t| t.reset_abstraction }
+ abstractor.abstract!(tokens)
+ end
+
+ tokens.each do |token|
  result << token.abstracted_value
  end

  return result.join(" ")
  end
+
+ def to_s
+ @tokens.map { |t| t.value }.join("")
+ end
+
+ def dump(filename, mode = "w", force = false)
+ if mode.downcase.include?("w") && !force
+ if FileTest.exist?(filename) || FileTest.exist?(lexdata(filename))
+ raise "Destination filename or lexdata filename already exist."
+ end
+ end
+
+ File.open(filename, mode) do |f|
+ f << self.token_stream + "\n"
+ end
+
+ File.open(lexdata(filename), "#{mode}b") do |f|
+ f << Marshal.dump(@abstractor)
+ end
+ end
+
+ def self.load(file_or_filename, lexdata_or_lexdata_filename = nil)
+ if file_or_filename.is_a?(String) && (lexdata_or_lexdata_filename.is_a?(String) || !lexdata_or_lexdata_filename)
+ unless lexdata_or_lexdata_filename
+ return self.load_filename(file_or_filename)
+ else
+ return self.load_filename(file_or_filename, lexdata_or_lexdata_filename)
+ end
+ elsif file_or_filename.is_a?(File) && lexdata_or_lexdata_filename.is_a?(File)
+ return self.load_file(file_or_filename, lexdata_or_lexdata_filename)
+ else
+ raise "Unable to call with the provided input types: expected (String, String), (String), or (File, File)"
+ end
+ end
+
+ def self.load_filename(filename, lexdata_filename = filename + ".lexdata")
+ File.open(filename, "r") do |file|
+ File.open(lexdata_filename, "rb") do |lexdata_file|
+ return LexedContent.load_file(file, lexdata_file)
+ end
+ end
+ end
+
+ def self.load_file(file, lexdata_file)
+ line = file.readline
+ abstractor = Marshal.load(lexdata_file)
+ return LexedContent.from_stream_string(line, abstractor)
+ end
+
+ private
+ def lexdata(filename)
+ filename + ".lexdata"
+ end
  end
  end
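
Taken together, the LexedContent changes let an abstractor travel with the lexed tokens and be serialized next to them. A rough sketch of the new dump/load round trip, reusing hand-built tokens as above; the filename is made up, and dump refuses to overwrite existing files unless force is passed:

    require 'code-lexer'

    abstractor = CodeLexer::Abstractor.new.abstract_identifiers.abstract_numbers
    tokens = [
      CodeLexer::Token.new(:identifier, "total"),
      CodeLexer::Token.new(:operator, "="),
      CodeLexer::Token.new(:number, "3")
    ]

    # Passing the abstractor triggers abstract! inside LexedContent#initialize.
    lexed = CodeLexer::LexedContent.new(tokens, abstractor)
    lexed.token_stream                  # space-separated abstracted values

    lexed.dump("example.lex")           # also writes "example.lex.lexdata" (Marshal of the abstractor)

    restored = CodeLexer::LexedContent.load("example.lex")
    restored.tokens                     # rebuilt via Token.from_string and Abstractor#deabstract!
    restored.reconstruct                # joins the restored token values back into text
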
lib/code-lexer/token.rb CHANGED
@@ -11,6 +11,19 @@ module CodeLexer
  attr_accessor :value
  attr_accessor :abstracted_value

+ def self.from_string(string)
+ unless string.start_with?(SPECIAL_TOKEN_OPEN)
+ value = string
+ else
+ value = nil
+ end
+
+ token = Token.new(:unknown, value)
+ token.abstracted_value = string
+
+ return token
+ end
+
  def initialize(type, value)
  @type = type
  self.value = value
@@ -18,13 +31,7 @@ module CodeLexer

  def value=(v)
  @value = v
- if @type == :newline
- @abstracted_value = Token.special("NEWLINE")
- elsif v =~ /\s/
- @abstracted_value = Token.special(v.gsub(/\s/, "·"))
- else
- @abstracted_value = v
- end
+ self.reset_abstraction
  end

  def to_s
@@ -38,5 +45,15 @@ module CodeLexer
  def ==(oth)
  @type == oth.type && @value == oth.value && @abstracted_value == oth.abstracted_value
  end
+
+ def reset_abstraction
+ if @type == :newline
+ @abstracted_value = Token.special("NEWLINE")
+ elsif @value =~ /\s/
+ @abstracted_value = Token.special(@value.gsub(/\s/, "·"))
+ else
+ @abstracted_value = @value.clone
+ end
+ end
  end
  end
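
The Token changes split the old value= logic into a reusable reset_abstraction and add a constructor for entries read back from an abstracted stream. A small sketch, with "foo" and "ID0" as arbitrary example values:

    t = CodeLexer::Token.new(:identifier, "foo")
    t.abstracted_value                                   # defaults to a clone of the value
    t.abstracted_value = CodeLexer::Token.special("ID0")
    t.reset_abstraction                                  # back to the value-based default

    # Tokens rebuilt from a stream start as :unknown; special entries carry no
    # concrete value until Abstractor#deabstract! fills it in.
    CodeLexer::Token.from_string(CodeLexer::Token.special("NUMBER"))
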
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: code-lexer
  version: !ruby/object:Gem::Version
- version: '0.6'
+ version: '0.8'
  platform: ruby
  authors:
  - Simone Scalabrino
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2021-12-21 00:00:00.000000000 Z
+ date: 2022-11-09 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: code-assertions
@@ -40,6 +40,7 @@ files:
  - lib/code-lexer.rb
  - lib/code-lexer/abstractor.rb
  - lib/code-lexer/config.rb
+ - lib/code-lexer/languages/java.yml
  - lib/code-lexer/languages/javascript.yml
  - lib/code-lexer/lexer.rb
  - lib/code-lexer/token.rb
@@ -62,7 +63,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  - !ruby/object:Gem::Version
  version: '0'
  requirements: []
- rubygems_version: 3.2.32
+ rubygems_version: 3.3.7
  signing_key:
  specification_version: 4
  summary: Simple source code lexer