code-lexer 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/code-lexer/abstractor.rb +110 -0
- data/lib/code-lexer/config.rb +38 -0
- data/lib/code-lexer/languages/javascript.clex +24 -0
- data/lib/code-lexer/lexer.rb +66 -0
- data/lib/code-lexer/token.rb +42 -0
- data/lib/code-lexer.rb +10 -0
- metadata +69 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: d6c98649f07e77d4148fb744db9b79fb0ed714113d6aaee7cf02f249868070c9
|
4
|
+
data.tar.gz: 6c6deb5e8f6778a036dd60cf49d649192620455d0197358b4783e0df0dd91bce
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 60556343e374a1c7ea58a076473fa98e9a15f4b8a89451788675a2415eb7c2da05f9255463abe9a2ae75da239c84a6186f8d5bc7aa570dd16580bdd8e685a7e7
|
7
|
+
data.tar.gz: 974dcf39a0a41c496f61429dd0e5b16557a80be856fef450dddac29a19ca72dc4b4ed0c33a2ff9226a4c683af665b44d1b26c2b36ac49d15cd96d47cc350cd20
|
@@ -0,0 +1,110 @@
|
|
1
|
+
require_relative 'token'
|
2
|
+
|
3
|
+
module CodeLexer
  # Post-processes a token stream in place: replaces concrete token values
  # with abstract placeholders (identifiers, numbers, strings, comments,
  # spaces) and/or removes whole token categories. All configuration
  # methods are chainable; the actual work happens in #abstract!.
  class Abstractor
    # Identifier dictionary; the index of an identifier is its abstract ID.
    # Index 0 is reserved for the "NOOP" placeholder.
    attr_reader :dictionary

    # dictionary: identifiers collected by previous runs, so abstract IDs
    # stay stable across multiple inputs.
    def initialize(dictionary=[])
      @dictionary = ["NOOP"] + dictionary
    end

    # Replace each identifier with a dictionary-indexed placeholder (IDn).
    def abstract_identifiers
      @abstract_identifiers = true
      return self
    end

    # Replace each number literal with a NUMBER placeholder.
    def abstract_numbers
      @abstract_numbers = true
      return self
    end

    # Replace each comment with a COMMENT placeholder.
    def abstract_comments
      @abstract_comments = true
      return self
    end

    # Replace each string literal with a STRING placeholder.
    def abstract_strings
      @abstract_strings = true
      return self
    end

    # Replace spaces with INDENTATION/WHITESPACE placeholders.
    def abstract_spaces
      @abstract_spaces = true
      return self
    end

    # Drop space tokens entirely (takes precedence over abstract_spaces).
    def remove_spaces
      @remove_spaces = true
      return self
    end

    # Drop newline tokens entirely.
    def remove_newlines
      @remove_newlines = true
      return self
    end

    # Drop comment tokens entirely (takes precedence over abstract_comments).
    def remove_comments
      @remove_comments = true
      return self
    end

    # Applies the configured abstractions/removals to +tokens+ (mutated in
    # place). Returns self so calls can be chained.
    def abstract!(tokens)
      if @abstract_identifiers
        identifier_tokens = tokens.select { |t| t.type == :identifier }
        identifier_tokens.map(&:value).uniq.each do |id|
          # Reuse the dictionary slot if the identifier was seen before;
          # otherwise append it, so IDs remain stable across calls.
          if @dictionary.include?(id)
            abstracted_id = @dictionary.index(id)
          else
            abstracted_id = @dictionary.size
            @dictionary << id
          end

          identifier_tokens.select { |t| t.value == id }.each do |matching_token|
            matching_token.abstracted_value = Token.special("ID#{abstracted_id}")
          end
        end
      end

      if @remove_comments
        tokens.delete_if { |t| t.type == :comment }
      elsif @abstract_comments
        tokens.select { |t| t.type == :comment }.each do |comment_token|
          comment_token.abstracted_value = Token.special("COMMENT")
        end
      end

      if @abstract_numbers
        tokens.select { |t| t.type == :number }.each do |number_token|
          number_token.abstracted_value = Token.special("NUMBER")
        end
      end

      if @abstract_strings
        tokens.select { |t| t.type == :string }.each do |string_token|
          string_token.abstracted_value = Token.special("STRING")
        end
      end

      tokens.delete_if { |t| t.type == :newline } if @remove_newlines

      if @remove_spaces
        tokens.delete_if { |t| t.type == :space }
      elsif @abstract_spaces
        # BUG FIX: the original used tokens.index(space_token), which relies
        # on Token#== (value equality) and therefore returned the position
        # of the *first* token equal to this one; identical space tokens
        # later in the stream were classified against the wrong preceding
        # token. Iterate with the real position instead.
        tokens.each_with_index do |token, index|
          next unless token.type == :space
          # A space at the very start or right after a newline is
          # indentation; anywhere else it is plain whitespace.
          if index.zero? || tokens[index - 1].type == :newline
            token.abstracted_value = Token.special("INDENTATION")
          else
            token.abstracted_value = Token.special("WHITESPACE")
          end
        end
      end

      return self
    end
  end
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
module CodeLexer
  # Parses a .clex rule file ("name:regex", one rule per line) and finds,
  # for a given text, the rule that matches at the earliest position.
  class Config
    # Ordered list of [name, regex] pairs; always ends with a catch-all
    # [:other, /./] rule.
    attr_reader :rules

    # path: the .clex rule file to load.
    def initialize(path)
      @config = File.basename(path)
      @rules = []

      load_rules(File.read(path))
    end

    # Returns the [name, regex] pair whose regex matches +text+ at the
    # earliest position (ties broken by rule order, earlier rule wins),
    # or nil when no rule matches at all.
    def matching_rule(text)
      min_score = 10000
      min_couple = []
      @rules.each do |name, regex|
        # =~ yields the match position (0 is truthy in Ruby) or nil.
        if (score = (text =~ regex))
          if score < min_score
            min_score = score
            min_couple = [name, regex]
          end
        end
      end

      return *min_couple
    end

    private
    # Parses "name:regex" lines; each loaded regex is anchored at the start
    # of the text. The catch-all :other rule is appended last.
    def load_rules(content)
      content.split("\n").each do |line|
        name, regex = line.split(":", 2)
        # FIX: blank or malformed lines (missing ":") used to raise a
        # TypeError when concatenating nil into the pattern; skip them.
        next if name.nil? || name.empty? || regex.nil?

        @rules << [name.to_sym, Regexp.new("^" + regex)]
      end

      @rules << [:other, /./]
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
keyword:(?:abstract|arguments|boolean|break|byte|case|catch|char|const|continue|debugger|default|delete|do|double|else|eval|false|final|finally|float|for|function|goto|if|implements|in|instanceof|int|interface|let|long|native|new|null|package|private|protected|public|return|short|static|switch|synchronized|this|throw|throws|transient|true|try|typeof|var|void|volatile|while|with|yield|class|enum|export|extends|import|super|from)
|
2
|
+
identifier:[$A-Za-z_][$A-Za-z0-9_]*
|
3
|
+
comment:\/\/[^\n\r]*[\n\r]
|
4
|
+
comment:\/\/[^\n\r]*$
|
5
|
+
comment:\/\*([^*]|[\r\n]|(\*+([^*/]|[\r\n])))*\*+\/
|
6
|
+
string:\"([^"]|\\\")*\"
|
7
|
+
string:\'[^']*\'
|
8
|
+
number:\-?[0-9]+
|
9
|
+
number:\-?[1-9][0-9]*
|
10
|
+
number:\-?[0-9]*\.[0-9]
|
11
|
+
number:\-?[0-9]*\.[0-9]e\-?[0-9]+
|
12
|
+
number:\-?0[Xx][0-9A-Fa-f]+
|
13
|
+
number:\-?0[0-7]+
|
14
|
+
operator:(\=\=\=|\!\=\=|\<\=|\>\=|\=\=|\!\=)
|
15
|
+
operator:(\&\&|\|\||\||\!)
|
16
|
+
operator:(\=|\+\=|\-\=|\/\=|\*\=|\%\=)
|
17
|
+
operator:(\&|\||\~|\^|\<\<|\>\>)
|
18
|
+
operator:(\+\+|\-\-|\+|\-|\/|\*|\%)
|
19
|
+
operator:(\.|\,|\:)
|
20
|
+
operator:(\<|\>)
|
21
|
+
parenthesis:(\(|\)|\[|\]|\{|\})
|
22
|
+
semicolon:\;
|
23
|
+
newline:[\n\r]
|
24
|
+
space:[ \t]+
|
@@ -0,0 +1,66 @@
|
|
1
|
+
require_relative 'token'
|
2
|
+
require_relative 'abstractor'
|
3
|
+
require_relative 'config'
|
4
|
+
|
5
|
+
module CodeLexer
  # Tokenizes source text according to a Config's lexing rules.
  class Lexer
    # Accepts either a ready-made Config or a path to a .clex rule file.
    def initialize(config_path_or_config)
      @config = if config_path_or_config.is_a?(Config)
        config_path_or_config
      else
        Config.new(config_path_or_config)
      end
    end

    # Repeatedly consumes the earliest-matching rule from the input until
    # nothing remains, collecting one Token per match. Returns a
    # LexedContent wrapping the token list.
    def lex(content)
      remaining = content.clone
      collected = []
      until remaining.empty?
        rule_name, rule_regex = @config.matching_rule(remaining)
        # sub! removes the matched text from the front of the buffer while
        # the block records it as a token.
        remaining.sub!(rule_regex) do |matched|
          collected << Token.new(rule_name, matched)
          ""
        end
      end

      LexedContent.new(collected)
    end
  end

  # Wraps a lexed token list with line- and stream-oriented views.
  class LexedContent
    attr_reader :tokens

    def initialize(tokens)
      @tokens = tokens
    end

    # Groups the tokens into lines, splitting on :newline tokens (which are
    # not included in the output) and discarding empty lines.
    def token_lines
      lines = [[]]
      @tokens.each do |token|
        token.type == :newline ? lines << [] : lines.last << token
      end

      lines.reject(&:empty?)
    end

    # Joins the tokens' abstracted values into one space-separated string.
    # When an abstractor is given, it is applied to the tokens first.
    def token_stream(abstractor = nil)
      abstractor.abstract!(@tokens) if abstractor

      @tokens.map(&:abstracted_value).join(" ")
    end
  end
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
module CodeLexer
  # A single lexed token: its type, raw value, and abstracted value.
  class Token
    # Delimiters wrapped around special (abstract) token names.
    SPECIAL_TOKEN_OPEN = "¬"
    SPECIAL_TOKEN_CLOSE = "¬"

    # Wraps a name in the special-token delimiters, e.g. "¬NEWLINE¬".
    def self.special(token)
      "#{SPECIAL_TOKEN_OPEN}#{token}#{SPECIAL_TOKEN_CLOSE}"
    end

    attr_accessor :type
    attr_accessor :value
    attr_accessor :abstracted_value

    def initialize(type, value)
      @type = type
      self.value = value
    end

    # Setting the value also derives a default abstracted value:
    # newline tokens become the NEWLINE special token; values containing
    # any whitespace are wrapped as a special token with each whitespace
    # character rendered as "·"; everything else is kept verbatim.
    def value=(v)
      @value = v
      @abstracted_value =
        if @type == :newline
          Token.special("NEWLINE")
        elsif v =~ /\s/
          Token.special(v.gsub(/\s/, "·"))
        else
          v
        end
    end

    # Debug representation; the abstracted value is shown only when it
    # differs from the raw value.
    def to_s
      if @abstracted_value == @value
        "<#@type:#{@value.inspect}>"
      else
        "<#@type:#{@value.inspect}:#{@abstracted_value.inspect}>"
      end
    end

    # Tokens are equal when type, value, and abstracted value all match.
    def ==(oth)
      @type == oth.type && @value == oth.value && @abstracted_value == oth.abstracted_value
    end
  end
end
|
data/lib/code-lexer.rb
ADDED
@@ -0,0 +1,10 @@
|
|
1
|
+
require_relative 'code-lexer/config'
|
2
|
+
require_relative 'code-lexer/abstractor'
|
3
|
+
require_relative 'code-lexer/lexer'
|
4
|
+
require_relative 'code-lexer/token'
|
5
|
+
|
6
|
+
module CodeLexer
  # Convenience factory: builds a Lexer for one of the bundled language
  # definitions (e.g. CodeLexer.get("javascript")), resolving the .clex
  # rule file relative to this source file.
  def self.get(language)
    base = File.dirname(File.expand_path(__FILE__))
    Lexer.new("#{base}/code-lexer/languages/#{language}.clex")
  end
end
|
metadata
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: code-lexer
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: '0.1'
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Simone Scalabrino
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2021-11-28 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: code-assertions
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 1.1.2
|
20
|
+
- - ">="
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: 1.1.2
|
23
|
+
type: :runtime
|
24
|
+
prerelease: false
|
25
|
+
version_requirements: !ruby/object:Gem::Requirement
|
26
|
+
requirements:
|
27
|
+
- - "~>"
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: 1.1.2
|
30
|
+
- - ">="
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: 1.1.2
|
33
|
+
description: Source code lexer configurable for any programming language that allows
|
34
|
+
to tokenize and abstract a given source file
|
35
|
+
email: s.scalabrino9@gmail.com
|
36
|
+
executables: []
|
37
|
+
extensions: []
|
38
|
+
extra_rdoc_files: []
|
39
|
+
files:
|
40
|
+
- lib/code-lexer.rb
|
41
|
+
- lib/code-lexer/abstractor.rb
|
42
|
+
- lib/code-lexer/config.rb
|
43
|
+
- lib/code-lexer/languages/javascript.clex
|
44
|
+
- lib/code-lexer/lexer.rb
|
45
|
+
- lib/code-lexer/token.rb
|
46
|
+
homepage: https://github.com/intersimone999/code-lexer
|
47
|
+
licenses:
|
48
|
+
- GPL-3.0-only
|
49
|
+
metadata: {}
|
50
|
+
post_install_message:
|
51
|
+
rdoc_options: []
|
52
|
+
require_paths:
|
53
|
+
- lib
|
54
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
55
|
+
requirements:
|
56
|
+
- - ">="
|
57
|
+
- !ruby/object:Gem::Version
|
58
|
+
version: '0'
|
59
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
60
|
+
requirements:
|
61
|
+
- - ">="
|
62
|
+
- !ruby/object:Gem::Version
|
63
|
+
version: '0'
|
64
|
+
requirements: []
|
65
|
+
rubygems_version: 3.2.29
|
66
|
+
signing_key:
|
67
|
+
specification_version: 4
|
68
|
+
summary: Simple source code lexer
|
69
|
+
test_files: []
|