code-lexer 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: d6c98649f07e77d4148fb744db9b79fb0ed714113d6aaee7cf02f249868070c9
4
+ data.tar.gz: 6c6deb5e8f6778a036dd60cf49d649192620455d0197358b4783e0df0dd91bce
5
+ SHA512:
6
+ metadata.gz: 60556343e374a1c7ea58a076473fa98e9a15f4b8a89451788675a2415eb7c2da05f9255463abe9a2ae75da239c84a6186f8d5bc7aa570dd16580bdd8e685a7e7
7
+ data.tar.gz: 974dcf39a0a41c496f61429dd0e5b16557a80be856fef450dddac29a19ca72dc4b4ed0c33a2ff9226a4c683af665b44d1b26c2b36ac49d15cd96d47cc350cd20
@@ -0,0 +1,110 @@
1
require_relative 'token'

module CodeLexer
  # Rewrites the abstracted values of lexed tokens (identifiers, literals,
  # comments, whitespace) according to the configured flags. All
  # abstract_*/remove_* configuration methods return self so they can be
  # chained before calling #abstract!.
  class Abstractor
    # Identifier dictionary; index 0 is the reserved "NOOP" entry and each
    # identifier is abstracted to ID<index in this list>.
    attr_reader :dictionary

    # @param dictionary [Array<String>] previously collected identifiers,
    #   used to keep ID numbering stable across multiple inputs
    def initialize(dictionary = [])
      @dictionary = ["NOOP"] + dictionary
    end

    # Replace identifier tokens with numbered ID placeholders.
    def abstract_identifiers
      @abstract_identifiers = true
      return self
    end

    # Replace number literals with a NUMBER placeholder.
    def abstract_numbers
      @abstract_numbers = true
      return self
    end

    # Replace comments with a COMMENT placeholder (ignored when
    # remove_comments is also set).
    def abstract_comments
      @abstract_comments = true
      return self
    end

    # Replace string literals with a STRING placeholder.
    def abstract_strings
      @abstract_strings = true
      return self
    end

    # Replace space tokens with INDENTATION/WHITESPACE placeholders
    # (ignored when remove_spaces is also set).
    def abstract_spaces
      @abstract_spaces = true
      return self
    end

    # Drop space tokens entirely.
    def remove_spaces
      @remove_spaces = true
      return self
    end

    # Drop newline tokens entirely.
    def remove_newlines
      @remove_newlines = true
      return self
    end

    # Drop comment tokens entirely.
    def remove_comments
      @remove_comments = true
      return self
    end

    # Applies the configured abstractions/removals to +tokens+ in place and
    # returns self. Newly seen identifiers are appended to #dictionary.
    def abstract!(tokens)
      if @abstract_identifiers
        identifier_tokens = tokens.select { |t| t.type == :identifier }
        identifiers = identifier_tokens.map { |id| id.value }.uniq

        identifiers.each do |id|
          if @dictionary.include?(id)
            abstracted_id = @dictionary.index(id)
          else
            abstracted_id = @dictionary.size
            @dictionary << id
          end

          identifier_tokens.select { |t| t.value == id }.each do |matching_token|
            matching_token.abstracted_value = Token.special("ID#{abstracted_id}")
          end
        end
      end

      if @remove_comments
        tokens.delete_if { |t| t.type == :comment }
      elsif @abstract_comments
        tokens.select { |t| t.type == :comment }.each do |comment_token|
          comment_token.abstracted_value = Token.special("COMMENT")
        end
      end

      if @abstract_numbers
        tokens.select { |t| t.type == :number }.each do |number_token|
          number_token.abstracted_value = Token.special("NUMBER")
        end
      end

      if @abstract_strings
        tokens.select { |t| t.type == :string }.each do |string_token|
          string_token.abstracted_value = Token.special("STRING")
        end
      end

      if @remove_newlines
        tokens.delete_if { |t| t.type == :newline }
      end

      if @remove_spaces
        tokens.delete_if { |t| t.type == :space }
      elsif @abstract_spaces
        # BUG FIX: the original located each space with tokens.index(space_token);
        # Token#== compares by type/value, so every space identical to an earlier
        # one resolved to the FIRST occurrence's index and could be misclassified
        # (WHITESPACE reported as INDENTATION or vice versa). Iterate with the
        # real positional index instead.
        tokens.each_with_index do |token, index|
          next unless token.type == :space

          if index.zero? || tokens[index - 1].type == :newline
            token.abstracted_value = Token.special("INDENTATION")
          else
            token.abstracted_value = Token.special("WHITESPACE")
          end
        end
      end

      return self
    end
  end
end
@@ -0,0 +1,38 @@
1
+ module CodeLexer
2
+ class Config
3
+ attr_reader :rules
4
+ def initialize(path)
5
+ @config = File.basename(path)
6
+ @rules = []
7
+
8
+ load_rules(File.read(path))
9
+ end
10
+
11
+ def matching_rule(text)
12
+ min_score = 10000
13
+ min_couple = []
14
+ @rules.each do |name, regex|
15
+ if (score = (text =~ regex))
16
+ if score < min_score
17
+ min_score = score
18
+ min_couple = [name, regex]
19
+ end
20
+ end
21
+ end
22
+
23
+ return *min_couple
24
+ end
25
+
26
+ private
27
+ def load_rules(content)
28
+ content.split("\n").each do |line|
29
+ name, regex = line.split(":", 2)
30
+ regex = Regexp.new("^" + regex)
31
+
32
+ @rules << [name.to_sym, regex]
33
+ end
34
+
35
+ @rules << [:other, /./]
36
+ end
37
+ end
38
+ end
@@ -0,0 +1,24 @@
1
+ keyword:(?:abstract|arguments|boolean|break|byte|case|catch|char|const|continue|debugger|default|delete|do|double|else|eval|false|final|finally|float|for|function|goto|if|implements|in|instanceof|int|interface|let|long|native|new|null|package|private|protected|public|return|short|static|switch|synchronized|this|throw|throws|transient|true|try|typeof|var|void|volatile|while|with|yield|class|enum|export|extends|import|super|from)
2
+ identifier:[$A-Za-z_][$A-Za-z0-9_]*
3
+ comment:\/\/[^\n\r]*[\n\r]
4
+ comment:\/\/[^\n\r]*$
5
+ comment:\/\*([^*]|[\r\n]|(\*+([^*/]|[\r\n])))*\*+\/
6
+ string:\"([^"]|\\\")*\"
7
+ string:\'[^']*\'
8
+ number:\-?[0-9]
9
+ number:\-?[1-9][0-9]*
10
+ number:\-?[0-9]*\.[0-9]
11
+ number:\-?[0-9]*\.[0-9]e\-?[0-9]+
12
+ number:\-?0[Xx][0-9A-Fa-f]+
13
+ number:\-?0[0-7]+
14
+ operator:(\<\=|\>\=|\=\=|\=\=\=|\!\=\=|\!\=)
15
+ operator:(\&\&|\||\|\||\!)
16
+ operator:(\=|\+\=|\-\=|\/\=|\*\=|\%\=)
17
+ operator:(\&|\||\~|\^|\<\<|\>\>)
18
+ operator:(\+|\-|\/|\*|\%|\+\+|\-\-)
19
+ operator:(\.|\,|\:)
20
+ operator:(\<|\>)
21
+ parenthesis:(\(|\)|\[|\]|\{|\})
22
+ semicolon:\;
23
+ newline:[\n\r]
24
+ space:\s+
@@ -0,0 +1,66 @@
1
require_relative 'token'
require_relative 'abstractor'
require_relative 'config'

module CodeLexer
  # Tokenizes source text according to the rules of a Config.
  class Lexer
    # Accepts either a ready Config instance or a path to a .clex file.
    def initialize(config_path_or_config)
      @config =
        if config_path_or_config.is_a?(Config)
          config_path_or_config
        else
          Config.new(config_path_or_config)
        end
    end

    # Consumes +content+ from the front, one best-matching rule at a time,
    # turning each match into a Token. The argument itself is not modified
    # (a clone is consumed). Returns the tokens wrapped in a LexedContent.
    def lex(content)
      remaining = content.clone
      collected = []
      until remaining.empty?
        token_name, regex = @config.matching_rule(remaining)
        remaining.sub!(regex) do |matched_text|
          collected << Token.new(token_name, matched_text)
          ""
        end
      end

      LexedContent.new(collected)
    end
  end

  # The ordered token list produced by Lexer#lex.
  class LexedContent
    attr_reader :tokens

    def initialize(tokens)
      @tokens = tokens
    end

    # Groups tokens into lines, splitting on :newline tokens (which are
    # dropped). Lines with no tokens are omitted from the result.
    def token_lines
      lines = [[]]
      @tokens.each do |token|
        if token.type == :newline
          lines << []
        else
          lines.last << token
        end
      end

      lines.reject(&:empty?)
    end

    # Returns the space-joined abstracted values of all tokens. When an
    # +abstractor+ is given it is applied first (mutating the tokens).
    def token_stream(abstractor = nil)
      abstractor.abstract!(@tokens) if abstractor

      @tokens.map(&:abstracted_value).join(" ")
    end
  end
end
@@ -0,0 +1,42 @@
1
module CodeLexer
  # A single lexical token: its type, raw value, and abstracted form.
  class Token
    SPECIAL_TOKEN_OPEN = "¬"
    SPECIAL_TOKEN_CLOSE = "¬"

    # Wraps +token+ in the special-token delimiters.
    def self.special(token)
      "#{SPECIAL_TOKEN_OPEN}#{token}#{SPECIAL_TOKEN_CLOSE}"
    end

    attr_accessor :type
    attr_accessor :value
    attr_accessor :abstracted_value

    # NOTE: @type must be assigned before the value= writer runs, since the
    # writer derives the abstracted value from the type.
    def initialize(type, value)
      @type = type
      self.value = value
    end

    # Assigns the raw value and derives the default abstracted value:
    # newline tokens become ¬NEWLINE¬; values containing any whitespace get
    # each whitespace character replaced by "·" and are wrapped as special;
    # everything else is kept verbatim.
    def value=(v)
      @value = v
      @abstracted_value =
        if @type == :newline
          Token.special("NEWLINE")
        elsif v =~ /\s/
          Token.special(v.gsub(/\s/, "·"))
        else
          v
        end
    end

    # Debug representation; shows the abstracted value only when it differs
    # from the raw value.
    def to_s
      description = "#{@type}:#{@value.inspect}"
      description << ":#{@abstracted_value.inspect}" if @abstracted_value != @value
      "<#{description}>"
    end

    # Tokens are equal when type, raw value, and abstracted value all match.
    def ==(other)
      @type == other.type &&
        @value == other.value &&
        @abstracted_value == other.abstracted_value
    end
  end
end
data/lib/code-lexer.rb ADDED
@@ -0,0 +1,10 @@
1
require_relative 'code-lexer/config'
require_relative 'code-lexer/abstractor'
require_relative 'code-lexer/lexer'
require_relative 'code-lexer/token'

module CodeLexer
  # Builds a Lexer preconfigured for +language+, resolving the bundled
  # .clex rule file relative to this source file.
  def self.get(language)
    base_directory = File.dirname(File.expand_path(__FILE__))
    Lexer.new("#{base_directory}/code-lexer/languages/#{language}.clex")
  end
end
metadata ADDED
@@ -0,0 +1,69 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: code-lexer
3
+ version: !ruby/object:Gem::Version
4
+ version: '0.1'
5
+ platform: ruby
6
+ authors:
7
+ - Simone Scalabrino
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2021-11-28 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: code-assertions
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: 1.1.2
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: 1.1.2
23
+ type: :runtime
24
+ prerelease: false
25
+ version_requirements: !ruby/object:Gem::Requirement
26
+ requirements:
27
+ - - "~>"
28
+ - !ruby/object:Gem::Version
29
+ version: 1.1.2
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: 1.1.2
33
+ description: Source code lexer configurable for any programming language that allows
34
+ to tokenize and abstract a given source file
35
+ email: s.scalabrino9@gmail.com
36
+ executables: []
37
+ extensions: []
38
+ extra_rdoc_files: []
39
+ files:
40
+ - lib/code-lexer.rb
41
+ - lib/code-lexer/abstractor.rb
42
+ - lib/code-lexer/config.rb
43
+ - lib/code-lexer/languages/javascript.clex
44
+ - lib/code-lexer/lexer.rb
45
+ - lib/code-lexer/token.rb
46
+ homepage: https://github.com/intersimone999/code-lexer
47
+ licenses:
48
+ - GPL-3.0-only
49
+ metadata: {}
50
+ post_install_message:
51
+ rdoc_options: []
52
+ require_paths:
53
+ - lib
54
+ required_ruby_version: !ruby/object:Gem::Requirement
55
+ requirements:
56
+ - - ">="
57
+ - !ruby/object:Gem::Version
58
+ version: '0'
59
+ required_rubygems_version: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - ">="
62
+ - !ruby/object:Gem::Version
63
+ version: '0'
64
+ requirements: []
65
+ rubygems_version: 3.2.29
66
+ signing_key:
67
+ specification_version: 4
68
+ summary: Simple source code lexer
69
+ test_files: []