code-lexer 0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/lib/code-lexer/abstractor.rb +110 -0
- data/lib/code-lexer/config.rb +38 -0
- data/lib/code-lexer/languages/javascript.clex +24 -0
- data/lib/code-lexer/lexer.rb +66 -0
- data/lib/code-lexer/token.rb +42 -0
- data/lib/code-lexer.rb +10 -0
- metadata +69 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: d6c98649f07e77d4148fb744db9b79fb0ed714113d6aaee7cf02f249868070c9
|
4
|
+
data.tar.gz: 6c6deb5e8f6778a036dd60cf49d649192620455d0197358b4783e0df0dd91bce
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 60556343e374a1c7ea58a076473fa98e9a15f4b8a89451788675a2415eb7c2da05f9255463abe9a2ae75da239c84a6186f8d5bc7aa570dd16580bdd8e685a7e7
|
7
|
+
data.tar.gz: 974dcf39a0a41c496f61429dd0e5b16557a80be856fef450dddac29a19ca72dc4b4ed0c33a2ff9226a4c683af665b44d1b26c2b36ac49d15cd96d47cc350cd20
|
@@ -0,0 +1,110 @@
|
|
1
|
+
require_relative 'token'
|
2
|
+
|
3
|
+
module CodeLexer
  # Replaces the abstracted values of tokens with placeholders built by
  # Token.special (identifiers, numbers, strings, comments, spaces) and
  # optionally drops whole token categories from the list.
  #
  # Every option method switches one behaviour on and returns the abstractor
  # itself, so options can be chained before calling #abstract!.
  class Abstractor
    # Identifier names known so far. The position of a name in this list is
    # the number used in its "ID<n>" placeholder; position 0 is always "NOOP".
    attr_reader :dictionary

    # @param dictionary [Array<String>] identifier names placed right after
    #   the leading "NOOP" entry (the given array itself is not modified)
    def initialize(dictionary = [])
      @dictionary = ["NOOP"] + dictionary
    end

    # Replace every identifier with "ID<n>", n being its dictionary position.
    def abstract_identifiers
      @abstract_identifiers = true
      self
    end

    # Replace every number token with the "NUMBER" placeholder.
    def abstract_numbers
      @abstract_numbers = true
      self
    end

    # Replace every comment token with the "COMMENT" placeholder
    # (ignored when #remove_comments is also enabled).
    def abstract_comments
      @abstract_comments = true
      self
    end

    # Replace every string token with the "STRING" placeholder.
    def abstract_strings
      @abstract_strings = true
      self
    end

    # Replace every space token with "INDENTATION" or "WHITESPACE"
    # (ignored when #remove_spaces is also enabled).
    def abstract_spaces
      @abstract_spaces = true
      self
    end

    # Delete space tokens from the list.
    def remove_spaces
      @remove_spaces = true
      self
    end

    # Delete newline tokens from the list.
    def remove_newlines
      @remove_newlines = true
      self
    end

    # Delete comment tokens from the list.
    def remove_comments
      @remove_comments = true
      self
    end

    # Applies the enabled options to +tokens+ in place: abstracted values are
    # overwritten and removed categories are deleted from the array itself.
    #
    # @param tokens [Array] tokens responding to type, value and
    #   abstracted_value=
    # @return [Abstractor] self
    def abstract!(tokens)
      abstract_identifier_tokens(tokens) if @abstract_identifiers

      if @remove_comments
        tokens.delete_if { |t| t.type == :comment }
      elsif @abstract_comments
        replace_by_type(tokens, :comment, "COMMENT")
      end

      replace_by_type(tokens, :number, "NUMBER") if @abstract_numbers
      replace_by_type(tokens, :string, "STRING") if @abstract_strings

      # Spaces are classified while the newline tokens are still in the list:
      # "INDENTATION" is decided by looking at the preceding newline, so
      # newline removal has to come afterwards.
      if @remove_spaces
        tokens.delete_if { |t| t.type == :space }
      elsif @abstract_spaces
        abstract_space_tokens(tokens)
      end

      tokens.delete_if { |t| t.type == :newline } if @remove_newlines

      self
    end

    private

    # Numbers identifiers by dictionary position, appending unseen names in
    # order of first appearance.
    def abstract_identifier_tokens(tokens)
      tokens.each do |token|
        next unless token.type == :identifier

        number = @dictionary.index(token.value)
        unless number
          number = @dictionary.size
          @dictionary << token.value
        end
        token.abstracted_value = Token.special("ID#{number}")
      end
    end

    # Gives every token of the given type the same placeholder.
    def replace_by_type(tokens, type, placeholder)
      tokens.each do |token|
        token.abstracted_value = Token.special(placeholder) if token.type == type
      end
    end

    # Marks a space as INDENTATION when it opens the list or follows a
    # newline, and as WHITESPACE otherwise. Positions come from the iteration
    # itself rather than from Array#index, which compares tokens by value and
    # would rescan the list for every space.
    def abstract_space_tokens(tokens)
      tokens.each_with_index do |token, position|
        next unless token.type == :space

        at_line_start = position.zero? || tokens[position - 1].type == :newline
        token.abstracted_value = Token.special(at_line_start ? "INDENTATION" : "WHITESPACE")
      end
    end
  end
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
module CodeLexer
  # Ordered list of lexing rules read from a rule file.
  #
  # Each non-blank line of the file has the form "name:regex"; the text after
  # the first colon is used verbatim as a regular expression source. A final
  # :other rule matching any single character is always appended.
  class Config
    # @return [Array<Array(Symbol, Regexp)>] the rules, in file order
    attr_reader :rules

    # @param path [String] path of the rule file to load
    # @raise [ArgumentError] if a non-blank line has no "name:regex" shape
    def initialize(path)
      @config = File.basename(path)
      @rules = []

      load_rules(File.read(path))
    end

    # Finds the rule whose regex matches earliest in +text+; among rules
    # matching at the same position the one listed first wins.
    #
    # @param text [String] the text still to be tokenized
    # @return [Array] [name, regex] of the winning rule, or an empty array
    #   when no rule matches (e.g. for empty text)
    def matching_rule(text)
      best_position = nil
      best_rule = []

      @rules.each do |name, regex|
        position = text =~ regex
        next if position.nil?
        next if best_position && position >= best_position

        best_position = position
        best_rule = [name, regex]
        # Nothing can match earlier than the start of the text.
        break if position.zero?
      end

      best_rule
    end

    private

    # Parses the rule file content into @rules.
    def load_rules(content)
      content.each_line do |raw_line|
        # chomp also drops the "\r" of CRLF files, which would otherwise
        # become part of the pattern. The pattern itself is never stripped:
        # leading or trailing blanks may be significant in a regex.
        line = raw_line.chomp
        next if line.strip.empty?

        name, pattern = line.split(":", 2)
        if pattern.nil?
          raise ArgumentError, "#{@config}: malformed rule #{line.inspect} (expected name:regex)"
        end

        # \A pins the rule to the start of the remaining text ("^" would also
        # match after any line break further on); the group keeps a top-level
        # alternation in the pattern fully anchored.
        @rules << [name.to_sym, Regexp.new("\\A(?:#{pattern})")]
      end

      # Catch-all: consumes exactly one character, line breaks included.
      @rules << [:other, /\A./m]
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
keyword:(?:abstract|arguments|boolean|break|byte|case|catch|char|const|continue|debugger|default|delete|do|double|else|eval|false|final|finally|float|for|function|goto|if|implements|in|instanceof|int|interface|let|long|native|new|null|package|private|protected|public|return|short|static|switch|synchronized|this|throw|throws|transient|true|try|typeof|var|void|volatile|while|with|yield|class|enum|export|extends|import|super|from)(?![$A-Za-z0-9_])
|
2
|
+
identifier:[$A-Za-z_][$A-Za-z0-9_]*
|
3
|
+
comment:\/\/[^\n\r]*[\n\r]
|
4
|
+
comment:\/\/[^\n\r]*$
|
5
|
+
comment:\/\*([^*]|[\r\n]|(\*+([^*/]|[\r\n])))*\*+\/
|
6
|
+
string:\"([^"]|\\\")*\"
|
7
|
+
string:\'[^']*\'
|
8
|
+
number:\-?[0-9]
|
9
|
+
number:\-?[1-9][0-9]*
|
10
|
+
number:\-?[0-9]*\.[0-9]
|
11
|
+
number:\-?[0-9]*\.[0-9]e\-?[0-9]+
|
12
|
+
number:\-?0[Xx][0-9A-Fa-f]+
|
13
|
+
number:\-?0[0-7]+
|
14
|
+
operator:(\<\=|\>\=|\=\=|\=\=\=|\!\=\=|\!\=)
|
15
|
+
operator:(\&\&|\||\|\||\!)
|
16
|
+
operator:(\=|\+\=|\-\=|\/\=|\*\=|\%\=)
|
17
|
+
operator:(\&|\||\~|\^|\<\<|\>\>)
|
18
|
+
operator:(\+|\-|\/|\*|\%|\+\+|\-\-)
|
19
|
+
operator:(\.|\,|\:)
|
20
|
+
operator:(\<|\>)
|
21
|
+
parenthesis:(\(|\)|\[|\]|\{|\})
|
22
|
+
semicolon:\;
|
23
|
+
newline:[\n\r]
|
24
|
+
space:\s+
|
@@ -0,0 +1,66 @@
|
|
1
|
+
require_relative 'token'
|
2
|
+
require_relative 'abstractor'
|
3
|
+
require_relative 'config'
|
4
|
+
|
5
|
+
module CodeLexer
  # Splits source text into tokens using the rules of a Config.
  class Lexer
    # @param config_path_or_config [Config, String] a ready Config, or
    #   anything else, which is handed to Config.new (a rule file path)
    def initialize(config_path_or_config)
      @config =
        if config_path_or_config.is_a?(Config)
          config_path_or_config
        else
          Config.new(config_path_or_config)
        end
    end

    # Tokenizes +content+ by repeatedly asking the config for the matching
    # rule and cutting that rule's match out of the remaining text.
    #
    # @param content [String] text to tokenize; it is never modified and may
    #   be frozen
    # @return [LexedContent] the tokens in the order they were cut out
    # @raise [ArgumentError] if no rule matches the remaining text or the
    #   chosen rule matches an empty string (the text could never shrink)
    def lex(content)
      # dup, unlike clone, yields an unfrozen copy even for frozen input.
      remaining = content.dup
      tokens = []

      until remaining.empty?
        token_name, regex = @config.matching_rule(remaining)
        match = regex && regex.match(remaining)
        if match.nil? || match[0].empty?
          raise ArgumentError, "no rule consumes the input at #{remaining[0, 20].inspect}"
        end

        tokens << Token.new(token_name, match[0])
        # Drop only the matched span, wherever the rule matched.
        remaining = match.pre_match + match.post_match
      end

      LexedContent.new(tokens)
    end
  end

  # Result of Lexer#lex: the token list plus helpers to view it.
  class LexedContent
    # @return [Array] the tokens, in lexing order
    attr_reader :tokens

    # @param tokens [Array] tokens responding to type and abstracted_value
    def initialize(tokens)
      @tokens = tokens
    end

    # Groups the tokens by line, splitting on :newline tokens.
    #
    # @return [Array<Array>] one array per line that has at least one token;
    #   newline tokens themselves and empty lines are left out
    def token_lines
      lines = [[]]
      @tokens.each do |token|
        if token.type == :newline
          lines << []
        else
          lines.last << token
        end
      end

      lines.reject(&:empty?)
    end

    # Joins the abstracted values of all tokens with single spaces.
    #
    # @param abstractor [#abstract!, nil] when given, it is applied to the
    #   token list first, which modifies the tokens of this object in place
    # @return [String]
    def token_stream(abstractor = nil)
      abstractor.abstract!(@tokens) if abstractor

      @tokens.map(&:abstracted_value).join(" ")
    end
  end
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
module CodeLexer
  # A lexed token: its rule name (type), the matched text (value) and the
  # text that stands for it in an abstracted token stream.
  class Token
    # Delimiters wrapped around placeholder names by Token.special.
    SPECIAL_TOKEN_OPEN = "¬".freeze
    SPECIAL_TOKEN_CLOSE = "¬".freeze

    # @param token [String] placeholder name, e.g. "NEWLINE"
    # @return [String] the name enclosed in the special-token delimiters
    def self.special(token)
      "#{SPECIAL_TOKEN_OPEN}#{token}#{SPECIAL_TOKEN_CLOSE}"
    end

    attr_accessor :type
    attr_accessor :value
    attr_accessor :abstracted_value

    # @param type [Symbol] name of the rule that produced the token
    # @param value [String] the matched text
    def initialize(type, value)
      @type = type
      self.value = value
    end

    # Sets the value and resets the abstracted value derived from it:
    # newline tokens become the NEWLINE placeholder, values containing
    # whitespace become a placeholder with every whitespace character
    # replaced by "·", and any other value is used unchanged.
    def value=(v)
      @value = v
      @abstracted_value =
        if @type == :newline
          Token.special("NEWLINE")
        elsif v =~ /\s/
          Token.special(v.gsub(/\s/, "·"))
        else
          v
        end
    end

    # @return [String] "<type:value>", with the abstracted value appended
    #   when it differs from the value
    def to_s
      if @abstracted_value != @value
        "<#@type:#{@value.inspect}:#{@abstracted_value.inspect}>"
      else
        "<#@type:#{@value.inspect}>"
      end
    end

    # Two tokens are equal when type, value and abstracted value all match.
    # Anything that is not a Token is simply unequal instead of raising
    # NoMethodError on the missing accessors.
    def ==(oth)
      oth.is_a?(Token) &&
        @type == oth.type &&
        @value == oth.value &&
        @abstracted_value == oth.abstracted_value
    end
  end
end
|
data/lib/code-lexer.rb
ADDED
@@ -0,0 +1,10 @@
|
|
1
|
+
require_relative 'code-lexer/config'
|
2
|
+
require_relative 'code-lexer/abstractor'
|
3
|
+
require_relative 'code-lexer/lexer'
|
4
|
+
require_relative 'code-lexer/token'
|
5
|
+
|
6
|
+
module CodeLexer
  # Builds a Lexer from the rule file named after +language+ that sits in the
  # "code-lexer/languages" directory next to this file.
  #
  # @param language [#to_s] base name of the rule file, without ".clex"
  # @return [Lexer]
  def self.get(language)
    library_dir = File.dirname(File.expand_path(__FILE__))
    rule_file = "#{library_dir}/code-lexer/languages/#{language}.clex"

    Lexer.new(rule_file)
  end
end
|
metadata
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: code-lexer
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: '0.1'
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Simone Scalabrino
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2021-11-28 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: code-assertions
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 1.1.2
|
20
|
+
- - ">="
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: 1.1.2
|
23
|
+
type: :runtime
|
24
|
+
prerelease: false
|
25
|
+
version_requirements: !ruby/object:Gem::Requirement
|
26
|
+
requirements:
|
27
|
+
- - "~>"
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: 1.1.2
|
30
|
+
- - ">="
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: 1.1.2
|
33
|
+
description: Source code lexer configurable for any programming language that allows
|
34
|
+
to tokenize and abstract a given source file
|
35
|
+
email: s.scalabrino9@gmail.com
|
36
|
+
executables: []
|
37
|
+
extensions: []
|
38
|
+
extra_rdoc_files: []
|
39
|
+
files:
|
40
|
+
- lib/code-lexer.rb
|
41
|
+
- lib/code-lexer/abstractor.rb
|
42
|
+
- lib/code-lexer/config.rb
|
43
|
+
- lib/code-lexer/languages/javascript.clex
|
44
|
+
- lib/code-lexer/lexer.rb
|
45
|
+
- lib/code-lexer/token.rb
|
46
|
+
homepage: https://github.com/intersimone999/code-lexer
|
47
|
+
licenses:
|
48
|
+
- GPL-3.0-only
|
49
|
+
metadata: {}
|
50
|
+
post_install_message:
|
51
|
+
rdoc_options: []
|
52
|
+
require_paths:
|
53
|
+
- lib
|
54
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
55
|
+
requirements:
|
56
|
+
- - ">="
|
57
|
+
- !ruby/object:Gem::Version
|
58
|
+
version: '0'
|
59
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
60
|
+
requirements:
|
61
|
+
- - ">="
|
62
|
+
- !ruby/object:Gem::Version
|
63
|
+
version: '0'
|
64
|
+
requirements: []
|
65
|
+
rubygems_version: 3.2.29
|
66
|
+
signing_key:
|
67
|
+
specification_version: 4
|
68
|
+
summary: Simple source code lexer
|
69
|
+
test_files: []
|