RubyGems - code-lexer - Versions diffs - 0.3 → 0.7 - Mend

code-lexer 0.3 → 0.7

Files changed (9) hide show

checksums.yaml +4 -4
data/lib/code-lexer/abstractor.rb +195 -55
data/lib/code-lexer/config.rb +11 -6
data/lib/code-lexer/languages/javascript.yml +38 -0
data/lib/code-lexer/lexer.rb +80 -7
data/lib/code-lexer/token.rb +24 -7
data/lib/code-lexer.rb +1 -1
metadata +4 -4
data/lib/code-lexer/languages/javascript.clex +0 -25

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 1d118a62a0320daf8a38c03fc9133dddbb6c5ab55d1386c323a53fb9b7ace59e
-  data.tar.gz: 6c5e4401890025c6b379e289c9b63a581d4a3731ff62f5f903e95ec153145274
+  metadata.gz: 63422a570f2d8b96e95ee2dfce7f67b797df7aabeca3b1a92c67535d497bd5f7
+  data.tar.gz: 6b2abf7eaf8cc3518de8998b67be1fd4823c4a59db2c879aec04561e6453ac3e
 SHA512:
-  metadata.gz: 116f92a402b3f4077a357a35e3241934b1acc4760569e57ffa14fa07dbd7276f8d020d4c88d4aa8191c1cfcb48140cc4401dcfdb5a8f0e6328383a80afb0041c
-  data.tar.gz: be5d36fe5a1434f9771194584ebe9b45fdf8fdd257e1ab277cd180940448459ca6791e5140a9831078d24542d61185aa89014e37c13f44dfa14e900c33d56e90
+  metadata.gz: 27a3cdf2d95c3e832c48988441e1f5eb466ec595399cbf93676fccaf6b5c6edfb58337782742ebcc1bae6b8052ab05ae3d61e01ad297706d00298a4b22160ff2
+  data.tar.gz: 9b2e9fac2f678751f11018a13589b692871a86b4ab652524c3af9f5c042266f4398ee902e9310b77f5ded49ae8357503d99bf6740ec42de914f1da283817aa75

data/lib/code-lexer/abstractor.rb CHANGED Viewed

@@ -1,11 +1,14 @@
 require_relative 'token'
 module CodeLexer
-    class Abstractor
-        attr_reader     :dictionary
-        def initialize(dictionary=[])
-            @dictionary = ["NOOP"] + dictionary
+    class Abstractor
+        def initialize(identifiers_dictionary = [], strings_dictionary = [], numbers_dictionary = [])
+            @dictionary = {}
+            @dictionary[:identifiers] = ['NOOP'] + identifiers_dictionary
+            @dictionary[:strings] = strings_dictionary
+            @dictionary[:numbers] = numbers_dictionary
+            @abstractor_pieces = []
         end
         def abstract_everything
@@ -18,103 +21,240 @@ module CodeLexer
             return self
         end
+        def dictionary
+            warn "[DEPRECATION] The method CodeLexer::Abstractor#dictionary is deprecated; used CodeLexer::Abstractor#identifiers_dictionary instead"
+            self.identifiers_dictionary
+        end
+        def identifiers_dictionary
+            @dictionary[:identifiers]
+        end
+        def strings_dictionary
+            @dictionary[:strings]
+        end
+        def numbers_dictionary
+            @dictionary[:numbers]
+        end
+        def dictionaries
+            @dictionary
+        end
         def abstract_identifiers
-            @abstract_identifiers = true
+            @abstractor_pieces << IdentifierAbstractor.new(self)
             return self
         end
         def abstract_numbers
-            @abstract_numbers = true
+            @abstractor_pieces << NumberAbstractor.new(self)
             return self
         end
         def abstract_comments
-            @abstract_comments = true
+            @abstractor_pieces << CommentAbstractor.new(self)
             return self
         end
         def abstract_strings
-            @abstract_strings = true
+            @abstractor_pieces << StringAbstractor.new(self)
             return self
         end
         def abstract_spaces
-            @abstract_spaces = true
+            @abstractor_pieces << SpaceAbstractor.new(self)
             return self
         end
         def remove_spaces
-            @remove_spaces = true
+            @abstractor_pieces << SpaceRemover.new(self)
             return self
         end
         def remove_newlines
-            @remove_newlines = true
+            @abstractor_pieces << NewlineRemover.new(self)
             return self
         end
         def remove_comments
-            @remove_comments = true
+            @abstractor_pieces << CommentRemover.new(self)
             return self
         end
         def abstract!(tokens)
-            if @abstract_identifiers
-                identifier_tokens = tokens.select { |t| t.type == :identifier }
-                identifiers = identifier_tokens.map { |id| id.value }.uniq
-                identifiers.each do |id|
-                    if @dictionary.include?(id)
-                        abstracted_id = @dictionary.index(id)
-                    else
-                        abstracted_id = @dictionary.size
-                        @dictionary << id
-                    end
-                    identifier_tokens.select { |t| t.value == id }.each do |matching_token|
-                        matching_token.abstracted_value = Token.special("ID#{abstracted_id}")
-                    end
-                end
+            @abstractor_pieces.each do |abstractor_piece|
+                tokens = abstractor_piece.abstract(tokens)
             end
-            if @remove_comments
-                tokens.delete_if { |t| t.type == :comment }
-            elsif @abstract_comments
-                tokens.select { |t| t.type == :comment }.each do |comment_token|
-                    comment_token.abstracted_value = Token.special("COMMENT")
-                end
+            return self
+        end
+        def deabstract!(tokens)
+            @abstractor_pieces.each do |abstractor_piece|
+                tokens = abstractor_piece.deabstract(tokens)
             end
-            if @abstract_numbers
-                tokens.select { |t| t.type == :number }.each do |number_token|
-                    number_token.abstracted_value = Token.special("NUMBER")
+            return self
+        end
+    end
+    class AbstractorPiece
+        def initialize(abstractor)
+            @abstractor = abstractor
+        end
+        def abstract(tokens)
+            return tokens
+        end
+        def deabstract(tokens)
+            return tokens
+        end
+    end
+    class IdentifierAbstractor < AbstractorPiece
+        def abstract(tokens)
+            identifier_tokens = tokens.select { |t| t.type == :identifier }
+            identifiers = identifier_tokens.map { |id| id.value }.uniq
+            identifiers.each do |id|
+                if @abstractor.identifiers_dictionary.include?(id)
+                    abstracted_id = @abstractor.identifiers_dictionary.index(id)
+                else
+                    abstracted_id = @abstractor.identifiers_dictionary.size
+                    @abstractor.identifiers_dictionary << id
+                end
+                identifier_tokens.select { |t| t.value == id }.each do |matching_token|
+                    matching_token.abstracted_value = Token.special("ID#{abstracted_id}")
                 end
             end
-            if @abstract_strings
-                tokens.select { |t| t.type == :string }.each do |string_token|
-                    string_token.abstracted_value = Token.special("STRING")
-                end
+            return tokens
+        end
+        def deabstract(tokens)
+            tokens.select { |t| t.abstracted_value.match?(/.ID[0-9]+./) }.each do |token|
+                id = token.abstracted_value.scan(/.ID([0-9]+)./).flatten[0].to_i
+                token.type = :identifier
+                token.value = @abstractor.identifiers_dictionary[id]
             end
-            if @remove_newlines
-                tokens.delete_if { |t| t.type == :newline }
+            return tokens
+        end
+    end
+    class NumberAbstractor < AbstractorPiece
+        def abstract(tokens)
+            tokens.select { |t| t.type == :number }.each do |number_token|
+                number_token.abstracted_value = Token.special("NUMBER")
+                @abstractor.numbers_dictionary << number_token.value
             end
-            if @remove_spaces
-                tokens.delete_if { |t| t.type == :space }
-            elsif @abstract_spaces
-                tokens.select { |t| t.type == :space }.each do |space_token|
-                    previous_index = tokens.index(space_token) - 1
-                    if previous_index < 0 || tokens[previous_index].type == :newline
-                        space_token.abstracted_value = Token.special("INDENTATION")
-                    else
-                        space_token.abstracted_value = Token.special("WHITESPACE")
-                    end
+            return tokens
+        end
+        def deabstract(tokens)
+            id = 0
+            tokens.select { |t| t.abstracted_value == Token.special("NUMBER") }.each do |token|
+                token.type = :number
+                token.value = @abstractor.numbers_dictionary[id]
+                id += 1
+            end
+            return tokens
+        end
+    end
+    class StringAbstractor < AbstractorPiece
+        def abstract(tokens)
+            tokens.select { |t| t.type == :string }.each do |string_token|
+                string_token.abstracted_value = Token.special("STRING")
+                @abstractor.strings_dictionary << string_token.value
+            end
+            return tokens
+        end
+        def deabstract(tokens)
+            id = 0
+            tokens.select { |t| t.abstracted_value == Token.special("STRING") }.each do |token|
+                token.type = :string
+                token.value = '"' + @abstractor.strings_dictionary[id] + '"'
+                id += 1
+            end
+            return tokens
+        end
+    end
+    class CommentAbstractor < AbstractorPiece
+        def abstract(tokens)
+            tokens.select { |t| t.type == :comment }.each do |comment_token|
+                comment_token.abstracted_value = Token.special("COMMENT")
+            end
+            return tokens
+        end
+        def deabstract(tokens)
+            tokens.select { |t| t.abstracted_value == Token.special("COMMENT") }.each do |token|
+                token.type = :comment
+                token.value = 'Unknown comment'
+            end
+            return tokens
+        end
+    end
+    class SpaceAbstractor < AbstractorPiece
+        def abstract(tokens)
+            tokens.select { |t| t.type == :space }.each do |space_token|
+                previous_index = tokens.index(space_token) - 1
+                if previous_index < 0 || tokens[previous_index].type == :newline
+                    space_token.abstracted_value = Token.special("INDENTATION")
+                else
+                    space_token.abstracted_value = Token.special("WHITESPACE")
                 end
             end
-            return self
+            return tokens
+        end
+        def deabstract(tokens)
+            tokens.select do |t|
+                t.abstracted_value == Token.special("INDENTATION") ||
+                t.abstracted_value == Token.special("WHITESPACE")
+            end.each do |token|
+                token.type = :space
+                token.value = ' '
+            end
+            return tokens
+        end
+    end
+    class SpaceRemover < AbstractorPiece
+        def abstract(tokens)
+            tokens.delete_if { |t| t.type == :space }
+            return tokens
+        end
+    end
+    class NewlineRemover < AbstractorPiece
+        def abstract(tokens)
+            tokens.delete_if { |t| t.type == :newline }
+            return tokens
+        end
+    end
+    class CommentRemover < AbstractorPiece
+        def abstract(tokens)
+            tokens.delete_if { |t| t.type == :comment }
+            return tokens
         end
     end
 end

data/lib/code-lexer/config.rb CHANGED Viewed

@@ -1,3 +1,5 @@
+require 'yaml'
 module CodeLexer
     class Config
         attr_reader     :rules
@@ -5,7 +7,7 @@ module CodeLexer
             @config = File.basename(path)
             @rules = []
-            load_rules(File.read(path))
+            load_rules(path)
         end
         def matching_rule(text)
@@ -25,11 +27,14 @@ module CodeLexer
         private
         def load_rules(content)
-            content.split("\n").each do |line|
-                name, regex = line.split(":", 2)
-                regex = Regexp.new("^" + regex)
-                @rules << [name.to_sym, regex]
+            parsed = YAML.load_file(content)
+            parsed['lexer'].each do |name, regexs|
+                regexs.each do |regex|
+                    regex = Regexp.new("^" + regex, Regexp::MULTILINE)
+                    @rules << [name.to_sym, regex]
+                end
             end
             @rules << [:other, /./]

data/lib/code-lexer/languages/javascript.yml ADDED Viewed

@@ -0,0 +1,38 @@
+lexer:
+    keyword:
+        - (?:abstract|arguments|boolean|break|byte|case|catch|char|const|continue|debugger|default|delete|do|double|else|eval|false|final|finally|float|for|function|goto|if|implements|in|instanceof|int|interface|let|long|native|new|null|package|private|protected|public|return|short|static|switch|synchronized|this|throw|throws|transient|true|try|typeof|var|void|volatile|while|with|yield|class|enum|export|extends|import|super|from)
+    identifier:
+        - "[$A-Za-z_][$A-Za-z0-9_]*"
+    comment:
+        - \/\/[^\n\r]*(?=[\n\r])
+        - \/\/.*$
+        - \/\*([^*]|[\r\n]|(\*+([^*/]|[\r\n])))*\*+\/
+    string:
+        - \"([^"]|\\\")*\"
+        - \'[^']*\'
+    regex:
+        - \/([^/]|\\\/)*\/[gim]*
+    number:
+        - \-?[0-9]*\.[0-9]e\-?[0-9]+
+        - \-?[0-9]*\.[0-9]
+        - \-?[1-9][0-9]*
+        - \-?0[Xx][0-9A-Fa-f]+
+        - \-?[0-9]
+        - \-?0[0-7]+
+    operator:
+        - (\=\=\=|\!\=\=)
+        - (\<\=|\>\=|\=\=|\!\=|\=\>)
+        - (\&\&|\|\||\!)
+        - (\+\=|\-\=|\/\=|\*\=|\%\=|\+\+|\-\-)
+        - (\&|\||\~|\^|\<\<|\>\>)
+        - (\=|\+|\-|\/|\*|\%)
+        - (\.|\,|\:)
+        - (\<|\>|\?)
+    parenthesis:
+        - (\(|\)|\[|\]|\{|\})
+    semicolon:
+        - \;
+    newline:
+        - "[\\n\\r]"
+    space:
+        - \s+

data/lib/code-lexer/lexer.rb CHANGED Viewed

@@ -12,7 +12,7 @@ module CodeLexer
             end
         end
-        def lex(content)
+        def lex(content, abstractor = nil)
             content = content.clone
             tokens = []
             while content.length > 0
@@ -23,17 +23,31 @@ module CodeLexer
                 end
             end
-            return LexedContent.new(tokens)
+            return LexedContent.new(tokens, abstractor)
         end
     end
     class LexedContent
         attr_reader     :tokens
+        attr_reader     :abstractor
-        def initialize(tokens)
+        def self.from_stream_string(stream, abstractor)
+            tokens = stream.split(" ").map { |t| Token.from_string(t) }
+            abstractor.deabstract!(tokens)
+            return LexedContent.new(tokens, abstractor)
+        end
+        def initialize(tokens, abstractor = nil)
             @tokens = tokens
+            @abstractor = abstractor
+            @abstractor.abstract!(@tokens) if @abstractor
+        end
+        def reconstruct
+            @tokens.map { |t| t.value.to_s }.join("")
         end
         def token_lines
             result = []
             current_line = []
@@ -53,14 +67,73 @@ module CodeLexer
         end
         def token_stream(abstractor = nil)
-            abstractor.abstract!(@tokens) if abstractor
             result = []
-            @tokens.each do |token|
+            tokens = @tokens
+            if abstractor
+                tokens = tokens.map { |t| t.clone }
+                tokens.each { |t| t.reset_abstraction }
+                abstractor.abstract!(tokens)
+            end
+            tokens.each do |token|
                 result << token.abstracted_value
             end
             return result.join(" ")
         end
+        def to_s
+            @tokens.map { |t| t.value }.join("")
+        end
+        def dump(filename, mode = "w", force = false)
+            if mode.downcase.include?("w") && !force
+                if FileTest.exist?(filename) || FileTest.exist?(lexdata(filename))
+                    raise "Destination filename or lexdata filename already exist."
+                end
+            end
+            File.open(filename, mode) do |f|
+                f << self.token_stream + "\n"
+            end
+            File.open(lexdata(filename), "#{mode}b") do |f|
+                f << Marshal.dump(@abstractor)
+            end
+        end
+        def self.load(file_or_filename, lexdata_or_lexdata_filename = nil)
+            if file_or_filename.is_a?(String) && (lexdata_or_lexdata_filename.is_a?(String) || !lexdata_or_lexdata_filename)
+                unless lexdata_or_lexdata_filename
+                    return self.load_filename(file_or_filename)
+                else
+                    return self.load_filename(file_or_filename, lexdata_or_lexdata_filename)
+                end
+            elsif file_or_filename.is_a?(File) && lexdata_or_lexdata_filename.is_a?(File)
+                return self.load_file(file_or_filename, lexdata_or_lexdata_filename)
+            else
+                raise "Unable to call with the provided input types: expected (String, String), (String), or (File, File)"
+            end
+        end
+        def self.load_filename(filename, lexdata_filename = filename + ".lexdata")
+            File.open(filename, "r") do |file|
+                File.open(lexdata_filename, "rb") do |lexdata_file|
+                    return LexedContent.load_file(file, lexdata_file)
+                end
+            end
+        end
+        def self.load_file(file, lexdata_file)
+            line = file.readline
+            abstractor = Marshal.load(lexdata_file)
+            return LexedContent.from_stream_string(line, abstractor)
+        end
+        private
+        def lexdata(filename)
+            filename + ".lexdata"
+        end
     end
 end

data/lib/code-lexer/token.rb CHANGED Viewed

@@ -11,6 +11,19 @@ module CodeLexer
         attr_accessor :value
         attr_accessor :abstracted_value
+        def self.from_string(string)
+            unless string.start_with?(SPECIAL_TOKEN_OPEN)
+                value = string
+            else
+                value = nil
+            end
+            token = Token.new(:unknown, value)
+            token.abstracted_value = string
+            return token
+        end
         def initialize(type, value)
             @type = type
             self.value = value
@@ -18,13 +31,7 @@ module CodeLexer
         def value=(v)
             @value = v
-            if @type == :newline
-                @abstracted_value = Token.special("NEWLINE")
-            elsif v =~ /\s/
-                @abstracted_value = Token.special(v.gsub(/\s/, "·"))
-            else
-                @abstracted_value = v
-            end
+            self.reset_abstraction
         end
         def to_s
@@ -38,5 +45,15 @@ module CodeLexer
         def ==(oth)
             @type == oth.type && @value == oth.value && @abstracted_value == oth.abstracted_value
         end
+        def reset_abstraction
+            if @type == :newline
+                @abstracted_value = Token.special("NEWLINE")
+            elsif @value =~ /\s/
+                @abstracted_value = Token.special(@value.gsub(/\s/, "·"))
+            else
+                @abstracted_value = @value.clone
+            end
+        end
     end
 end

data/lib/code-lexer.rb CHANGED Viewed

@@ -5,6 +5,6 @@ require_relative 'code-lexer/token'
 module CodeLexer
     def self.get(language)
-        return Lexer.new("#{File.dirname(File.expand_path(__FILE__))}/code-lexer/languages/#{language}.clex")
+        return Lexer.new("#{File.dirname(File.expand_path(__FILE__))}/code-lexer/languages/#{language}.yml")
     end
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: code-lexer
 version: !ruby/object:Gem::Version
-  version: '0.3'
+  version: '0.7'
 platform: ruby
 authors:
 - Simone Scalabrino
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2021-11-28 00:00:00.000000000 Z
+date: 2022-01-05 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: code-assertions
@@ -40,7 +40,7 @@ files:
 - lib/code-lexer.rb
 - lib/code-lexer/abstractor.rb
 - lib/code-lexer/config.rb
-- lib/code-lexer/languages/javascript.clex
+- lib/code-lexer/languages/javascript.yml
 - lib/code-lexer/lexer.rb
 - lib/code-lexer/token.rb
 homepage: https://github.com/intersimone999/code-lexer
@@ -62,7 +62,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.2.32
+rubygems_version: 3.3.3
 signing_key:
 specification_version: 4
 summary: Simple source code lexer

data/lib/code-lexer/languages/javascript.clex DELETED Viewed

@@ -1,25 +0,0 @@
-keyword:(?:abstract|arguments|boolean|break|byte|case|catch|char|const|continue|debugger|default|delete|do|double|else|eval|false|final|finally|float|for|function|goto|if|implements|in|instanceof|int|interface|let|long|native|new|null|package|private|protected|public|return|short|static|switch|synchronized|this|throw|throws|transient|true|try|typeof|var|void|volatile|while|with|yield|class|enum|export|extends|import|super|from)
-identifier:[$A-Za-z_][$A-Za-z0-9_]*
-comment:\/\/[^.]*[\n\r]
-comment:\/\/[^.]*$
-comment:\/\*([^*]|[\r\n]|(\*+([^*/]|[\r\n])))*\*+\/
-string:\"([^"]|\\\")*\"
-string:\'[^']*\'
-number:\-?[0-9]
-number:\-?[1-9][0-9]*
-number:\-?[0-9]*\.[0-9]
-number:\-?[0-9]*\.[0-9]e\-?[0-9]+
-number:\-?0[Xx][0-9A-Fa-f]+
-number:\-?0[0-7]+
-operator:(\=\=\=|\!\=\=)
-operator:(\<\=|\>\=|\=\=|\!\=)
-operator:(\&\&|\|\||\!)
-operator:(\+\=|\-\=|\/\=|\*\=|\%\=|\+\+|\-\-)
-operator:(\&|\||\~|\^|\<\<|\>\>)
-operator:(\=|\+|\-|\/|\*|\%)
-operator:(\.|\,|\:)
-operator:(\<|\>)
-parenthesis:(\(|\)|\[|\]|\{|\})
-semicolon:\;
-newline:[\n\r]
-space:\s+