code-lexer 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/code-lexer/abstractor.rb +110 -0
- data/lib/code-lexer/config.rb +38 -0
- data/lib/code-lexer/languages/javascript.clex +24 -0
- data/lib/code-lexer/lexer.rb +66 -0
- data/lib/code-lexer/token.rb +42 -0
- data/lib/code-lexer.rb +10 -0
- metadata +69 -0
    
        checksums.yaml
    ADDED
    
    | @@ -0,0 +1,7 @@ | |
| 1 | 
            +
            ---
         | 
| 2 | 
            +
            SHA256:
         | 
| 3 | 
            +
              metadata.gz: d6c98649f07e77d4148fb744db9b79fb0ed714113d6aaee7cf02f249868070c9
         | 
| 4 | 
            +
              data.tar.gz: 6c6deb5e8f6778a036dd60cf49d649192620455d0197358b4783e0df0dd91bce
         | 
| 5 | 
            +
            SHA512:
         | 
| 6 | 
            +
              metadata.gz: 60556343e374a1c7ea58a076473fa98e9a15f4b8a89451788675a2415eb7c2da05f9255463abe9a2ae75da239c84a6186f8d5bc7aa570dd16580bdd8e685a7e7
         | 
| 7 | 
            +
              data.tar.gz: 974dcf39a0a41c496f61429dd0e5b16557a80be856fef450dddac29a19ca72dc4b4ed0c33a2ff9226a4c683af665b44d1b26c2b36ac49d15cd96d47cc350cd20
         | 
| @@ -0,0 +1,110 @@ | |
| 1 | 
            +
            require_relative 'token'
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            module CodeLexer
                # Rewrites the abstracted value of lexed tokens (and optionally drops
                # some of them) according to the options enabled through the chainable
                # configuration methods, e.g.
                #
                #   Abstractor.new.abstract_identifiers.remove_comments.abstract!(tokens)
                class Abstractor
                    # Identifier dictionary. Index 0 always holds the "NOOP" entry added
                    # by the constructor; an identifier is abstracted to "ID<index>" where
                    # <index> is its position in this array.
                    attr_reader     :dictionary

                    # @param dictionary [Array<String>] identifiers that already have an
                    #   assigned position; they are appended after the leading "NOOP" entry.
                    #   The array passed in is not modified.
                    def initialize(dictionary=[])
                        @dictionary = ["NOOP"] + dictionary

                        # Every option is off until the matching method below is called.
                        @abstract_identifiers = false
                        @abstract_numbers     = false
                        @abstract_comments    = false
                        @abstract_strings     = false
                        @abstract_spaces      = false
                        @remove_spaces        = false
                        @remove_newlines      = false
                        @remove_comments      = false
                    end

                    # Replace every :identifier token with its dictionary index ("ID<n>").
                    def abstract_identifiers
                        @abstract_identifiers = true
                        return self
                    end

                    # Replace every :number token with the NUMBER marker.
                    def abstract_numbers
                        @abstract_numbers = true
                        return self
                    end

                    # Replace every :comment token with the COMMENT marker
                    # (ignored when remove_comments is also enabled).
                    def abstract_comments
                        @abstract_comments = true
                        return self
                    end

                    # Replace every :string token with the STRING marker.
                    def abstract_strings
                        @abstract_strings = true
                        return self
                    end

                    # Replace every :space token with INDENTATION or WHITESPACE
                    # (ignored when remove_spaces is also enabled).
                    def abstract_spaces
                        @abstract_spaces = true
                        return self
                    end

                    # Delete every :space token.
                    def remove_spaces
                        @remove_spaces = true
                        return self
                    end

                    # Delete every :newline token.
                    def remove_newlines
                        @remove_newlines = true
                        return self
                    end

                    # Delete every :comment token.
                    def remove_comments
                        @remove_comments = true
                        return self
                    end

                    # Applies the enabled options to +tokens+ in place.
                    #
                    # The steps run in a fixed order: identifiers, comments, numbers,
                    # strings, newline removal, then spaces. Newlines are removed before
                    # spaces are classified, so with remove_newlines enabled only a space
                    # at the very start of the list can be marked as INDENTATION.
                    #
                    # @param tokens [Array] token objects responding to #type, #value and
                    #   #abstracted_value=; the array itself is modified when tokens are removed
                    # @return [Abstractor] self
                    def abstract!(tokens)
                        abstract_identifier_tokens(tokens) if @abstract_identifiers

                        if @remove_comments
                            tokens.delete_if { |t| t.type == :comment }
                        elsif @abstract_comments
                            mark_tokens(tokens, :comment, "COMMENT")
                        end

                        mark_tokens(tokens, :number, "NUMBER") if @abstract_numbers
                        mark_tokens(tokens, :string, "STRING") if @abstract_strings

                        tokens.delete_if { |t| t.type == :newline } if @remove_newlines

                        if @remove_spaces
                            tokens.delete_if { |t| t.type == :space }
                        elsif @abstract_spaces
                            abstract_space_tokens(tokens)
                        end

                        return self
                    end

                    private

                    # Sets the abstracted value of every token of +type+ to the special
                    # marker built from +label+.
                    def mark_tokens(tokens, type, label)
                        tokens.each do |token|
                            token.abstracted_value = Token.special(label) if token.type == type
                        end
                    end

                    # Maps each identifier to "ID<n>", where n is its dictionary position.
                    # Unknown identifiers are appended in order of first appearance, so a
                    # single pass over the tokens yields the same numbering as collecting
                    # the unique identifiers first.
                    def abstract_identifier_tokens(tokens)
                        tokens.each do |token|
                            next unless token.type == :identifier

                            position = @dictionary.index(token.value)
                            if position.nil?
                                position = @dictionary.size
                                @dictionary << token.value
                            end

                            token.abstracted_value = Token.special("ID#{position}")
                        end
                    end

                    # Marks a space as INDENTATION when it is the first token or directly
                    # follows a :newline token, and as WHITESPACE otherwise. The position
                    # comes from the iteration index rather than Array#index, which would
                    # go through Token#== and rescan the list for every space.
                    def abstract_space_tokens(tokens)
                        tokens.each_with_index do |token, index|
                            next unless token.type == :space

                            at_line_start = index.zero? || tokens[index - 1].type == :newline
                            label = at_line_start ? "INDENTATION" : "WHITESPACE"
                            token.abstracted_value = Token.special(label)
                        end
                    end
                end
            end
         | 
| @@ -0,0 +1,38 @@ | |
| 1 | 
            +
            module CodeLexer
                # Ordered list of lexing rules read from a rules file.
                #
                # Each non-blank line of the file has the form `name:regex`; everything
                # after the first colon is the regular expression. A catch-all :other
                # rule matching any single character is always appended last.
                class Config
                    # Array of [name (Symbol), regex (Regexp)] pairs, in file order.
                    attr_reader     :rules

                    # @param path [String] path of the rules file to read
                    # @raise [ArgumentError] if a line is not of the form `name:regex`
                    def initialize(path)
                        # Only used to make error messages point at the offending file.
                        @config = File.basename(path)
                        @rules = []

                        load_rules(File.read(path))
                    end

                    # Finds the rule whose regex matches +text+ at the smallest offset;
                    # when several rules match at that offset the one listed first wins.
                    # Rules loaded from a file are anchored to the start of the text, so
                    # for them this is simply the first rule that matches at offset 0.
                    #
                    # @param text [String] the text still to be lexed
                    # @return [Array] [name, regex] of the selected rule, or [] when no
                    #   rule matches (e.g. for an empty string)
                    def matching_rule(text)
                        best_rule = nil
                        best_position = nil

                        @rules.each do |rule|
                            position = (text =~ rule[1])
                            next if position.nil?

                            # Strict comparison keeps the earliest listed rule on ties.
                            if best_position.nil? || position < best_position
                                best_position = position
                                best_rule = rule
                            end
                        end

                        return best_rule ? best_rule.dup : []
                    end

                    private

                    # Parses the rules file content and fills @rules.
                    def load_rules(content)
                        content.each_line.with_index(1) do |raw_line, line_number|
                            # chomp also drops the "\r" of CRLF files, which would
                            # otherwise become part of the regular expression.
                            line = raw_line.chomp
                            next if line.strip.empty?

                            name, pattern = line.split(":", 2)
                            if name.nil? || name.empty? || pattern.nil? || pattern.empty?
                                raise ArgumentError,
                                      "#{@config}:#{line_number}: expected `name:regex`, got #{line.inspect}"
                            end

                            # \A anchors to the start of the remaining text (unlike ^, which
                            # also matches after any newline), and the group makes the anchor
                            # apply to every alternative of an `a|b` pattern.
                            @rules << [name.to_sym, Regexp.new("\\A(?:#{pattern})")]
                        end

                        # Catch-all: /m lets "." match a newline too, so any non-empty
                        # text is matched by at least this rule.
                        @rules << [:other, /\A./m]
                    end
                end
            end
         | 
| @@ -0,0 +1,24 @@ | |
| 1 | 
            +
            keyword:(?:abstract|arguments|boolean|break|byte|case|catch|char|const|continue|debugger|default|delete|do|double|else|eval|false|final|finally|float|for|function|goto|if|implements|in|instanceof|int|interface|let|long|native|new|null|package|private|protected|public|return|short|static|switch|synchronized|this|throw|throws|transient|true|try|typeof|var|void|volatile|while|with|yield|class|enum|export|extends|import|super|from)(?![$A-Za-z0-9_])
            identifier:[$A-Za-z_][$A-Za-z0-9_]*
            comment:\/\/[^\n\r]*[\n\r]
            comment:\/\/[^\n\r]*$
            comment:\/\*([^*]|[\r\n]|(\*+([^*/]|[\r\n])))*\*+\/
            string:\"(?:[^"\\]|\\[\s\S])*\"
            string:\'(?:[^'\\]|\\[\s\S])*\'
            number:\-?0[Xx][0-9A-Fa-f]+
            number:\-?[0-9]*\.[0-9]+(?:[eE][+\-]?[0-9]+)?
            number:\-?[0-9]+(?:[eE][+\-]?[0-9]+)?
            operator:(\=\=\=|\!\=\=|\<\=|\>\=|\=\=|\!\=)
            operator:(\&\&|\|\||\||\!)
            operator:(\+\=|\-\=|\/\=|\*\=|\%\=|\=)
            operator:(\<\<|\>\>|\&|\||\~|\^)
            operator:(\+\+|\-\-|\+|\-|\/|\*|\%)
            operator:(\.|\,|\:)
            operator:(\<|\>)
            parenthesis:(\(|\)|\[|\]|\{|\})
            semicolon:\;
            newline:[\n\r]
            space:[^\S\n\r]+
         | 
| @@ -0,0 +1,66 @@ | |
| 1 | 
            +
            require_relative 'token'
         | 
| 2 | 
            +
            require_relative 'abstractor'
         | 
| 3 | 
            +
            require_relative 'config'
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            module CodeLexer
                # Turns source text into a LexedContent using the rules of a Config.
                class Lexer
                    # @param config_path_or_config [Config, String] a Config instance, or a
                    #   value that is passed to Config.new (a rules file path)
                    def initialize(config_path_or_config)
                        if config_path_or_config.is_a?(Config)
                            @config = config_path_or_config
                        else
                            @config = Config.new(config_path_or_config)
                        end
                    end

                    # Tokenizes +content+ from left to right.
                    #
                    # At every step the config picks a rule and the text matched by its
                    # regex becomes the next token. The input string is never modified, so
                    # frozen strings can be lexed.
                    #
                    # If the picked rule cannot consume text at the current position (no
                    # rule was returned, the match starts further ahead, or the match is
                    # empty), the next single character is emitted as an :other token. This
                    # keeps tokens in source order and guarantees the loop terminates.
                    #
                    # @param content [String] the source text
                    # @return [LexedContent] the tokens, in source order
                    def lex(content)
                        remaining = content
                        tokens = []

                        until remaining.empty?
                            token_name, regex = @config.matching_rule(remaining)
                            match = regex && regex.match(remaining)

                            if match && match.begin(0).zero? && !match[0].empty?
                                value = match[0]
                            else
                                token_name = :other
                                value = remaining[0]
                            end

                            tokens << Token.new(token_name, value)
                            remaining = remaining[value.length..-1]
                        end

                        return LexedContent.new(tokens)
                    end
                end

                # The result of lexing: an ordered list of tokens plus helpers to view
                # them line by line or as a single string.
                class LexedContent
                    # The token list, in source order.
                    attr_reader     :tokens

                    # @param tokens [Array] token objects, in source order
                    def initialize(tokens)
                        @tokens = tokens
                    end

                    # Groups the tokens by line.
                    #
                    # @return [Array<Array>] one array per line; :newline tokens act as
                    #   separators and are not included, and lines without any token
                    #   are dropped
                    def token_lines
                        lines = [[]]
                        @tokens.each do |token|
                            if token.type == :newline
                                lines << []
                            else
                                lines.last << token
                            end
                        end

                        return lines.reject { |line| line.empty? }
                    end

                    # Joins the abstracted values of all tokens with single spaces.
                    #
                    # @param abstractor [#abstract!, nil] when given, its abstract! method
                    #   is called on the stored token list first, so whatever it changes
                    #   or removes stays changed for later calls
                    # @return [String]
                    def token_stream(abstractor = nil)
                        abstractor.abstract!(@tokens) if abstractor

                        return @tokens.map { |token| token.abstracted_value }.join(" ")
                    end
                end
            end
         | 
| @@ -0,0 +1,42 @@ | |
| 1 | 
            +
            module CodeLexer
                # A single lexed token: its type, the original text and the (possibly
                # abstracted) text used when the token is written to a token stream.
                class Token
                    # Delimiters wrapped around special (abstracted) token names.
                    SPECIAL_TOKEN_OPEN  = "¬".freeze
                    SPECIAL_TOKEN_CLOSE = "¬".freeze

                    # @param token [String] name of the special token, e.g. "NEWLINE"
                    # @return [String] the name wrapped in the special-token delimiters
                    def self.special(token)
                        "#{SPECIAL_TOKEN_OPEN}#{token}#{SPECIAL_TOKEN_CLOSE}"
                    end

                    attr_accessor :type
                    attr_accessor :value
                    attr_accessor :abstracted_value

                    # @param type [Symbol] token type (e.g. :identifier, :newline)
                    # @param value [String] the matched source text
                    def initialize(type, value)
                        @type = type
                        # Goes through the writer so abstracted_value is initialised too.
                        self.value = value
                    end

                    # Sets the value and resets the abstracted value derived from it:
                    # the NEWLINE marker for :newline tokens, a marker with every
                    # whitespace character replaced by "·" for values containing
                    # whitespace, and the value itself otherwise.
                    def value=(v)
                        @value = v
                        if @type == :newline
                            @abstracted_value = Token.special("NEWLINE")
                        elsif v =~ /\s/
                            @abstracted_value = Token.special(v.gsub(/\s/, "·"))
                        else
                            @abstracted_value = v
                        end
                    end

                    # @return [String] "<type:value>" or, when the abstracted value
                    #   differs from the value, "<type:value:abstracted_value>"
                    def to_s
                        if @abstracted_value != @value
                            return "<#@type:#{@value.inspect}:#{@abstracted_value.inspect}>"
                        else
                            return "<#@type:#{@value.inspect}>"
                        end
                    end

                    # Two tokens are equal when type, value and abstracted value all match.
                    # Anything that is not a Token (nil, strings, ...) is simply unequal
                    # instead of raising NoMethodError.
                    def ==(oth)
                        return false unless oth.is_a?(Token)

                        @type == oth.type && @value == oth.value && @abstracted_value == oth.abstracted_value
                    end
                end
            end
         | 
    
        data/lib/code-lexer.rb
    ADDED
    
    | @@ -0,0 +1,10 @@ | |
| 1 | 
            +
            require_relative 'code-lexer/config'
         | 
| 2 | 
            +
            require_relative 'code-lexer/abstractor'
         | 
| 3 | 
            +
            require_relative 'code-lexer/lexer'
         | 
| 4 | 
            +
            require_relative 'code-lexer/token'
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            module CodeLexer
                # Builds a Lexer from the rules file bundled for +language+, looked up as
                # code-lexer/languages/<language>.clex next to this file.
                #
                # @param language [#to_s] name of the bundled rules file, without extension
                # @return [Lexer]
                def self.get(language)
                    library_dir = File.dirname(File.expand_path(__FILE__))
                    rules_path = "#{library_dir}/code-lexer/languages/#{language}.clex"

                    Lexer.new(rules_path)
                end
            end
         | 
    
        metadata
    ADDED
    
    | @@ -0,0 +1,69 @@ | |
| 1 | 
            +
            --- !ruby/object:Gem::Specification
         | 
| 2 | 
            +
            name: code-lexer
         | 
| 3 | 
            +
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            +
              version: '0.1'
         | 
| 5 | 
            +
            platform: ruby
         | 
| 6 | 
            +
            authors:
         | 
| 7 | 
            +
            - Simone Scalabrino
         | 
| 8 | 
            +
            autorequire:
         | 
| 9 | 
            +
            bindir: bin
         | 
| 10 | 
            +
            cert_chain: []
         | 
| 11 | 
            +
            date: 2021-11-28 00:00:00.000000000 Z
         | 
| 12 | 
            +
            dependencies:
         | 
| 13 | 
            +
            - !ruby/object:Gem::Dependency
         | 
| 14 | 
            +
              name: code-assertions
         | 
| 15 | 
            +
              requirement: !ruby/object:Gem::Requirement
         | 
| 16 | 
            +
                requirements:
         | 
| 17 | 
            +
                - - "~>"
         | 
| 18 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 19 | 
            +
                    version: 1.1.2
         | 
| 20 | 
            +
                - - ">="
         | 
| 21 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 22 | 
            +
                    version: 1.1.2
         | 
| 23 | 
            +
              type: :runtime
         | 
| 24 | 
            +
              prerelease: false
         | 
| 25 | 
            +
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 26 | 
            +
                requirements:
         | 
| 27 | 
            +
                - - "~>"
         | 
| 28 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 29 | 
            +
                    version: 1.1.2
         | 
| 30 | 
            +
                - - ">="
         | 
| 31 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 32 | 
            +
                    version: 1.1.2
         | 
| 33 | 
            +
            description: Source code lexer configurable for any programming language that allows
         | 
| 34 | 
            +
              to tokenize and abstract a given source file
         | 
| 35 | 
            +
            email: s.scalabrino9@gmail.com
         | 
| 36 | 
            +
            executables: []
         | 
| 37 | 
            +
            extensions: []
         | 
| 38 | 
            +
            extra_rdoc_files: []
         | 
| 39 | 
            +
            files:
         | 
| 40 | 
            +
            - lib/code-lexer.rb
         | 
| 41 | 
            +
            - lib/code-lexer/abstractor.rb
         | 
| 42 | 
            +
            - lib/code-lexer/config.rb
         | 
| 43 | 
            +
            - lib/code-lexer/languages/javascript.clex
         | 
| 44 | 
            +
            - lib/code-lexer/lexer.rb
         | 
| 45 | 
            +
            - lib/code-lexer/token.rb
         | 
| 46 | 
            +
            homepage: https://github.com/intersimone999/code-lexer
         | 
| 47 | 
            +
            licenses:
         | 
| 48 | 
            +
            - GPL-3.0-only
         | 
| 49 | 
            +
            metadata: {}
         | 
| 50 | 
            +
            post_install_message:
         | 
| 51 | 
            +
            rdoc_options: []
         | 
| 52 | 
            +
            require_paths:
         | 
| 53 | 
            +
            - lib
         | 
| 54 | 
            +
            required_ruby_version: !ruby/object:Gem::Requirement
         | 
| 55 | 
            +
              requirements:
         | 
| 56 | 
            +
              - - ">="
         | 
| 57 | 
            +
                - !ruby/object:Gem::Version
         | 
| 58 | 
            +
                  version: '0'
         | 
| 59 | 
            +
            required_rubygems_version: !ruby/object:Gem::Requirement
         | 
| 60 | 
            +
              requirements:
         | 
| 61 | 
            +
              - - ">="
         | 
| 62 | 
            +
                - !ruby/object:Gem::Version
         | 
| 63 | 
            +
                  version: '0'
         | 
| 64 | 
            +
            requirements: []
         | 
| 65 | 
            +
            rubygems_version: 3.2.29
         | 
| 66 | 
            +
            signing_key:
         | 
| 67 | 
            +
            specification_version: 4
         | 
| 68 | 
            +
            summary: Simple source code lexer
         | 
| 69 | 
            +
            test_files: []
         |