code-lexer 0.1

checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA256:
+   metadata.gz: d6c98649f07e77d4148fb744db9b79fb0ed714113d6aaee7cf02f249868070c9
+   data.tar.gz: 6c6deb5e8f6778a036dd60cf49d649192620455d0197358b4783e0df0dd91bce
+ SHA512:
+   metadata.gz: 60556343e374a1c7ea58a076473fa98e9a15f4b8a89451788675a2415eb7c2da05f9255463abe9a2ae75da239c84a6186f8d5bc7aa570dd16580bdd8e685a7e7
+   data.tar.gz: 974dcf39a0a41c496f61429dd0e5b16557a80be856fef450dddac29a19ca72dc4b4ed0c33a2ff9226a4c683af665b44d1b26c2b36ac49d15cd96d47cc350cd20
data/lib/code-lexer/abstractor.rb ADDED
@@ -0,0 +1,110 @@
+ require_relative 'token'
+
+ module CodeLexer
+     class Abstractor
+         attr_reader :dictionary
+
+         def initialize(dictionary=[])
+             @dictionary = ["NOOP"] + dictionary
+         end
+
+         def abstract_identifiers
+             @abstract_identifiers = true
+             return self
+         end
+
+         def abstract_numbers
+             @abstract_numbers = true
+             return self
+         end
+
+         def abstract_comments
+             @abstract_comments = true
+             return self
+         end
+
+         def abstract_strings
+             @abstract_strings = true
+             return self
+         end
+
+         def abstract_spaces
+             @abstract_spaces = true
+             return self
+         end
+
+         def remove_spaces
+             @remove_spaces = true
+             return self
+         end
+
+         def remove_newlines
+             @remove_newlines = true
+             return self
+         end
+
+         def remove_comments
+             @remove_comments = true
+             return self
+         end
+
+         def abstract!(tokens)
+             if @abstract_identifiers
+                 identifier_tokens = tokens.select { |t| t.type == :identifier }
+                 identifiers = identifier_tokens.map { |id| id.value }.uniq
+
+                 identifiers.each do |id|
+                     if @dictionary.include?(id)
+                         abstracted_id = @dictionary.index(id)
+                     else
+                         abstracted_id = @dictionary.size
+                         @dictionary << id
+                     end
+
+                     identifier_tokens.select { |t| t.value == id }.each do |matching_token|
+                         matching_token.abstracted_value = Token.special("ID#{abstracted_id}")
+                     end
+                 end
+             end
+
+             if @remove_comments
+                 tokens.delete_if { |t| t.type == :comment }
+             elsif @abstract_comments
+                 tokens.select { |t| t.type == :comment }.each do |comment_token|
+                     comment_token.abstracted_value = Token.special("COMMENT")
+                 end
+             end
+
+             if @abstract_numbers
+                 tokens.select { |t| t.type == :number }.each do |number_token|
+                     number_token.abstracted_value = Token.special("NUMBER")
+                 end
+             end
+
+             if @abstract_strings
+                 tokens.select { |t| t.type == :string }.each do |string_token|
+                     string_token.abstracted_value = Token.special("STRING")
+                 end
+             end
+
+             if @remove_newlines
+                 tokens.delete_if { |t| t.type == :newline }
+             end
+
+             if @remove_spaces
+                 tokens.delete_if { |t| t.type == :space }
+             elsif @abstract_spaces
+                 tokens.select { |t| t.type == :space }.each do |space_token|
+                     previous_index = tokens.index(space_token) - 1
+                     if previous_index < 0 || tokens[previous_index].type == :newline
+                         space_token.abstracted_value = Token.special("INDENTATION")
+                     else
+                         space_token.abstracted_value = Token.special("WHITESPACE")
+                     end
+                 end
+             end
+
+             return self
+         end
+     end
+ end
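
A minimal usage sketch (not shipped with the gem), with hand-built tokens invented for illustration: the configuration calls chain because each returns self, and abstract! rewrites the token list in place while growing the identifier dictionary.

require 'code-lexer'

# Hypothetical tokens; normally they come from CodeLexer::Lexer#lex.
tokens = [
    CodeLexer::Token.new(:identifier, "counter"),
    CodeLexer::Token.new(:operator, "+"),
    CodeLexer::Token.new(:number, "1")
]

abstractor = CodeLexer::Abstractor.new.abstract_identifiers.abstract_numbers
abstractor.abstract!(tokens)

tokens.map(&:abstracted_value)   # => ["¬ID1¬", "+", "¬NUMBER¬"]
abstractor.dictionary            # => ["NOOP", "counter"]
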
data/lib/code-lexer/config.rb ADDED
@@ -0,0 +1,38 @@
+ module CodeLexer
+     class Config
+         attr_reader :rules
+         def initialize(path)
+             @config = File.basename(path)
+             @rules = []
+
+             load_rules(File.read(path))
+         end
+
+         def matching_rule(text)
+             min_score = 10000
+             min_couple = []
+             @rules.each do |name, regex|
+                 if (score = (text =~ regex))
+                     if score < min_score
+                         min_score = score
+                         min_couple = [name, regex]
+                     end
+                 end
+             end
+
+             return *min_couple
+         end
+
+         private
+         def load_rules(content)
+             content.split("\n").each do |line|
+                 name, regex = line.split(":", 2)
+                 regex = Regexp.new("^" + regex)
+
+                 @rules << [name.to_sym, regex]
+             end
+
+             @rules << [:other, /./]
+         end
+     end
+ end
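
An illustrative sketch of Config#matching_rule (not part of the gem), assuming the bundled JavaScript rules are readable from the hypothetical relative path below. The method returns the rule whose regex matches closest to the start of the text; ties at the same position go to the rule listed first in the .clex file, so "for" is reported as :keyword rather than :identifier.

require 'code-lexer'

config = CodeLexer::Config.new("lib/code-lexer/languages/javascript.clex")
name, regex = config.matching_rule("for (var i = 0;")
name    # => :keyword
regex   # => the compiled keyword pattern, anchored with a leading "^" by load_rules
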
data/lib/code-lexer/languages/javascript.clex ADDED
@@ -0,0 +1,24 @@
+ keyword:(?:abstract|arguments|boolean|break|byte|case|catch|char|const|continue|debugger|default|delete|do|double|else|eval|false|final|finally|float|for|function|goto|if|implements|in|instanceof|int|interface|let|long|native|new|null|package|private|protected|public|return|short|static|switch|synchronized|this|throw|throws|transient|true|try|typeof|var|void|volatile|while|with|yield|class|enum|export|extends|import|super|from)
+ identifier:[$A-Za-z_][$A-Za-z0-9_]*
+ comment:\/\/[^.]*[\n\r]
+ comment:\/\/[^.]*$
+ comment:\/\*([^*]|[\r\n]|(\*+([^*/]|[\r\n])))*\*+\/
+ string:\"([^"]|\\\")*\"
+ string:\'[^']*\'
+ number:\-?[0-9]
+ number:\-?[1-9][0-9]*
+ number:\-?[0-9]*\.[0-9]
+ number:\-?[0-9]*\.[0-9]e\-?[0-9]+
+ number:\-?0[Xx][0-9A-Fa-f]+
+ number:\-?0[0-7]+
+ operator:(\<\=|\>\=|\=\=|\=\=\=|\!\=\=|\!\=)
+ operator:(\&\&|\||\|\||\!)
+ operator:(\=|\+\=|\-\=|\/\=|\*\=|\%\=)
+ operator:(\&|\||\~|\^|\<\<|\>\>)
+ operator:(\+|\-|\/|\*|\%|\+\+|\-\-)
+ operator:(\.|\,|\:)
+ operator:(\<|\>)
+ parenthesis:(\(|\)|\[|\]|\{|\})
+ semicolon:\;
+ newline:[\n\r]
+ space:\s+
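
Each line above is a name:regex pair; Config#load_rules splits on the first colon, anchors each pattern with a leading "^", and appends a catch-all :other rule. A sketch (not part of the gem) of what these rules produce for a one-line snippet:

require 'code-lexer'

lexed = CodeLexer.get("javascript").lex("let x = 7;")
lexed.tokens.map { |t| [t.type, t.value] }
# => [[:keyword, "let"], [:space, " "], [:identifier, "x"], [:space, " "],
#     [:operator, "="], [:space, " "], [:number, "7"], [:semicolon, ";"]]
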
data/lib/code-lexer/lexer.rb ADDED
@@ -0,0 +1,66 @@
+ require_relative 'token'
+ require_relative 'abstractor'
+ require_relative 'config'
+
+ module CodeLexer
+     class Lexer
+         def initialize(config_path_or_config)
+             if config_path_or_config.is_a?(Config)
+                 @config = config_path_or_config
+             else
+                 @config = Config.new(config_path_or_config)
+             end
+         end
+
+         def lex(content)
+             content = content.clone
+             tokens = []
+             while content.length > 0
+                 token_name, regex = @config.matching_rule(content)
+                 content.sub!(regex) do |value|
+                     tokens << Token.new(token_name, value)
+                     ""
+                 end
+             end
+
+             return LexedContent.new(tokens)
+         end
+     end
+
+     class LexedContent
+         attr_reader :tokens
+
+         def initialize(tokens)
+             @tokens = tokens
+         end
+
+         def token_lines
+             result = []
+             current_line = []
+             @tokens.each do |t|
+                 if t.type == :newline
+                     result << current_line
+                     current_line = []
+                 else
+                     current_line << t
+                 end
+             end
+
+             result << current_line
+             result.delete_if { |line| line.empty? }
+
+             return result
+         end
+
+         def token_stream(abstractor = nil)
+             abstractor.abstract!(@tokens) if abstractor
+
+             result = []
+             @tokens.each do |token|
+                 result << token.abstracted_value
+             end
+
+             return result.join(" ")
+         end
+     end
+ end
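
A usage sketch of LexedContent (not part of the gem), assuming the bundled JavaScript grammar: token_lines groups tokens by :newline and drops empty lines, while token_stream optionally abstracts the tokens first and then joins their abstracted values with spaces.

require 'code-lexer'

lexed = CodeLexer.get("javascript").lex("if (a) {\n  b();\n}")
lexed.token_lines.size   # => 3

abstractor = CodeLexer::Abstractor.new.abstract_identifiers.remove_spaces
lexed.token_stream(abstractor)
# => "if ( ¬ID1¬ ) { ¬NEWLINE¬ ¬ID2¬ ( ) ; ¬NEWLINE¬ }"
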
data/lib/code-lexer/token.rb ADDED
@@ -0,0 +1,42 @@
+ module CodeLexer
+     class Token
+         SPECIAL_TOKEN_OPEN = "¬"
+         SPECIAL_TOKEN_CLOSE = "¬"
+
+         def self.special(token)
+             "#{SPECIAL_TOKEN_OPEN}#{token}#{SPECIAL_TOKEN_CLOSE}"
+         end
+
+         attr_accessor :type
+         attr_accessor :value
+         attr_accessor :abstracted_value
+
+         def initialize(type, value)
+             @type = type
+             self.value = value
+         end
+
+         def value=(v)
+             @value = v
+             if @type == :newline
+                 @abstracted_value = Token.special("NEWLINE")
+             elsif v =~ /\s/
+                 @abstracted_value = Token.special(v.gsub(/\s/, "·"))
+             else
+                 @abstracted_value = v
+             end
+         end
+
+         def to_s
+             if @abstracted_value != @value
+                 return "<#@type:#{@value.inspect}:#{@abstracted_value.inspect}>"
+             else
+                 return "<#@type:#{@value.inspect}>"
+             end
+         end
+
+         def ==(oth)
+             @type == oth.type && @value == oth.value && @abstracted_value == oth.abstracted_value
+         end
+     end
+ end
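
For reference (illustration only, not part of the gem), the behaviour that follows from the class above: special markers are wrapped in ¬...¬, newlines abstract to ¬NEWLINE¬, and other whitespace is abstracted to middle dots.

require 'code-lexer'

CodeLexer::Token.special("NUMBER")                      # => "¬NUMBER¬"
CodeLexer::Token.new(:newline, "\n").abstracted_value   # => "¬NEWLINE¬"
CodeLexer::Token.new(:space, "  ").abstracted_value     # => "¬··¬"
CodeLexer::Token.new(:identifier, "x").to_s             # => <identifier:"x">
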
data/lib/code-lexer.rb ADDED
@@ -0,0 +1,10 @@
+ require_relative 'code-lexer/config'
+ require_relative 'code-lexer/abstractor'
+ require_relative 'code-lexer/lexer'
+ require_relative 'code-lexer/token'
+
+ module CodeLexer
+     def self.get(language)
+         return Lexer.new("#{File.dirname(File.expand_path(__FILE__))}/code-lexer/languages/#{language}.clex")
+     end
+ end
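
A hedged sketch of the entry point (not part of the gem): get("javascript") builds a Lexer around the bundled lib/code-lexer/languages/javascript.clex, and a custom grammar can also be supplied by path or as a prebuilt Config (the paths below are hypothetical).

require 'code-lexer'

lexer = CodeLexer.get("javascript")
# lexer = CodeLexer::Lexer.new("/path/to/custom.clex")
# lexer = CodeLexer::Lexer.new(CodeLexer::Config.new("/path/to/custom.clex"))
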
metadata ADDED
@@ -0,0 +1,69 @@
+ --- !ruby/object:Gem::Specification
+ name: code-lexer
+ version: !ruby/object:Gem::Version
+   version: '0.1'
+ platform: ruby
+ authors:
+ - Simone Scalabrino
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2021-11-28 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: code-assertions
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 1.1.2
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 1.1.2
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 1.1.2
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 1.1.2
+ description: Source code lexer configurable for any programming language that allows
+   to tokenize and abstract a given source file
+ email: s.scalabrino9@gmail.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - lib/code-lexer.rb
+ - lib/code-lexer/abstractor.rb
+ - lib/code-lexer/config.rb
+ - lib/code-lexer/languages/javascript.clex
+ - lib/code-lexer/lexer.rb
+ - lib/code-lexer/token.rb
+ homepage: https://github.com/intersimone999/code-lexer
+ licenses:
+ - GPL-3.0-only
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubygems_version: 3.2.29
+ signing_key:
+ specification_version: 4
+ summary: Simple source code lexer
+ test_files: []