code-lexer 0.6 → 0.7
- checksums.yaml +4 -4
- data/lib/code-lexer/abstractor.rb +195 -55
- data/lib/code-lexer/config.rb +1 -2
- data/lib/code-lexer/languages/javascript.yml +5 -3
- data/lib/code-lexer/lexer.rb +80 -7
- data/lib/code-lexer/token.rb +24 -7
- metadata +3 -3
checksums.yaml
CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 63422a570f2d8b96e95ee2dfce7f67b797df7aabeca3b1a92c67535d497bd5f7
+  data.tar.gz: 6b2abf7eaf8cc3518de8998b67be1fd4823c4a59db2c879aec04561e6453ac3e
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 27a3cdf2d95c3e832c48988441e1f5eb466ec595399cbf93676fccaf6b5c6edfb58337782742ebcc1bae6b8052ab05ae3d61e01ad297706d00298a4b22160ff2
+  data.tar.gz: 9b2e9fac2f678751f11018a13589b692871a86b4ab652524c3af9f5c042266f4398ee902e9310b77f5ded49ae8357503d99bf6740ec42de914f1da283817aa75

data/lib/code-lexer/abstractor.rb
CHANGED

@@ -1,11 +1,14 @@
 require_relative 'token'

 module CodeLexer
-    class Abstractor
-
-
-
-        @dictionary
+    class Abstractor
+        def initialize(identifiers_dictionary = [], strings_dictionary = [], numbers_dictionary = [])
+            @dictionary = {}
+            @dictionary[:identifiers] = ['NOOP'] + identifiers_dictionary
+            @dictionary[:strings] = strings_dictionary
+            @dictionary[:numbers] = numbers_dictionary
+
+            @abstractor_pieces = []
         end

         def abstract_everything

@@ -18,103 +21,240 @@ module CodeLexer
             return self
         end

+        def dictionary
+            warn "[DEPRECATION] The method CodeLexer::Abstractor#dictionary is deprecated; used CodeLexer::Abstractor#identifiers_dictionary instead"
+            self.identifiers_dictionary
+        end
+
+        def identifiers_dictionary
+            @dictionary[:identifiers]
+        end
+
+        def strings_dictionary
+            @dictionary[:strings]
+        end
+
+        def numbers_dictionary
+            @dictionary[:numbers]
+        end
+
+        def dictionaries
+            @dictionary
+        end
+
         def abstract_identifiers
-            @
+            @abstractor_pieces << IdentifierAbstractor.new(self)
             return self
         end

         def abstract_numbers
-            @
+            @abstractor_pieces << NumberAbstractor.new(self)
             return self
         end

         def abstract_comments
-            @
+            @abstractor_pieces << CommentAbstractor.new(self)
             return self
         end

         def abstract_strings
-            @
+            @abstractor_pieces << StringAbstractor.new(self)
             return self
         end

         def abstract_spaces
-            @
+            @abstractor_pieces << SpaceAbstractor.new(self)
             return self
         end

         def remove_spaces
-            @
+            @abstractor_pieces << SpaceRemover.new(self)
             return self
         end

         def remove_newlines
-            @
+            @abstractor_pieces << NewlineRemover.new(self)
             return self
         end

         def remove_comments
-            @
+            @abstractor_pieces << CommentRemover.new(self)
             return self
         end

         def abstract!(tokens)
-
-
-            identifiers = identifier_tokens.map { |id| id.value }.uniq
-
-            identifiers.each do |id|
-                if @dictionary.include?(id)
-                    abstracted_id = @dictionary.index(id)
-                else
-                    abstracted_id = @dictionary.size
-                    @dictionary << id
-                end
-
-                identifier_tokens.select { |t| t.value == id }.each do |matching_token|
-                    matching_token.abstracted_value = Token.special("ID#{abstracted_id}")
-                end
-            end
+            @abstractor_pieces.each do |abstractor_piece|
+                tokens = abstractor_piece.abstract(tokens)
             end

-
-
-
-
-
-
+            return self
+        end
+
+        def deabstract!(tokens)
+            @abstractor_pieces.each do |abstractor_piece|
+                tokens = abstractor_piece.deabstract(tokens)
             end

-
-
-
+            return self
+        end
+    end
+
+    class AbstractorPiece
+        def initialize(abstractor)
+            @abstractor = abstractor
+        end
+
+        def abstract(tokens)
+            return tokens
+        end
+
+        def deabstract(tokens)
+            return tokens
+        end
+    end
+
+    class IdentifierAbstractor < AbstractorPiece
+        def abstract(tokens)
+            identifier_tokens = tokens.select { |t| t.type == :identifier }
+            identifiers = identifier_tokens.map { |id| id.value }.uniq
+
+            identifiers.each do |id|
+                if @abstractor.identifiers_dictionary.include?(id)
+                    abstracted_id = @abstractor.identifiers_dictionary.index(id)
+                else
+                    abstracted_id = @abstractor.identifiers_dictionary.size
+                    @abstractor.identifiers_dictionary << id
+                end
+
+                identifier_tokens.select { |t| t.value == id }.each do |matching_token|
+                    matching_token.abstracted_value = Token.special("ID#{abstracted_id}")
                 end
             end

-
-
-
-
+            return tokens
+        end
+
+        def deabstract(tokens)
+            tokens.select { |t| t.abstracted_value.match?(/.ID[0-9]+./) }.each do |token|
+                id = token.abstracted_value.scan(/.ID([0-9]+)./).flatten[0].to_i
+
+                token.type = :identifier
+                token.value = @abstractor.identifiers_dictionary[id]
             end

-
-
+            return tokens
+        end
+    end
+
+    class NumberAbstractor < AbstractorPiece
+        def abstract(tokens)
+            tokens.select { |t| t.type == :number }.each do |number_token|
+                number_token.abstracted_value = Token.special("NUMBER")
+                @abstractor.numbers_dictionary << number_token.value
             end

-
-
-
-
-
-
-
-
-
-
+            return tokens
+        end
+
+        def deabstract(tokens)
+            id = 0
+            tokens.select { |t| t.abstracted_value == Token.special("NUMBER") }.each do |token|
+                token.type = :number
+                token.value = @abstractor.numbers_dictionary[id]
+
+                id += 1
+            end
+
+            return tokens
+        end
+    end
+
+    class StringAbstractor < AbstractorPiece
+        def abstract(tokens)
+            tokens.select { |t| t.type == :string }.each do |string_token|
+                string_token.abstracted_value = Token.special("STRING")
+                @abstractor.strings_dictionary << string_token.value
+            end
+
+            return tokens
+        end
+
+        def deabstract(tokens)
+            id = 0
+            tokens.select { |t| t.abstracted_value == Token.special("STRING") }.each do |token|
+                token.type = :string
+                token.value = '"' + @abstractor.strings_dictionary[id] + '"'
+
+                id += 1
+            end
+
+            return tokens
+        end
+    end
+
+    class CommentAbstractor < AbstractorPiece
+        def abstract(tokens)
+            tokens.select { |t| t.type == :comment }.each do |comment_token|
+                comment_token.abstracted_value = Token.special("COMMENT")
+            end
+            return tokens
+        end
+
+        def deabstract(tokens)
+            tokens.select { |t| t.abstracted_value == Token.special("COMMENT") }.each do |token|
+                token.type = :comment
+                token.value = 'Unknown comment'
+            end
+
+            return tokens
+        end
+    end
+
+    class SpaceAbstractor < AbstractorPiece
+        def abstract(tokens)
+            tokens.select { |t| t.type == :space }.each do |space_token|
+                previous_index = tokens.index(space_token) - 1
+                if previous_index < 0 || tokens[previous_index].type == :newline
+                    space_token.abstracted_value = Token.special("INDENTATION")
+                else
+                    space_token.abstracted_value = Token.special("WHITESPACE")
                 end
             end

-            return
+            return tokens
+        end
+
+        def deabstract(tokens)
+            tokens.select do |t|
+                t.abstracted_value == Token.special("INDENTATION") ||
+                t.abstracted_value == Token.special("WHITESPACE")
+            end.each do |token|
+                token.type = :space
+                token.value = ' '
+            end
+
+            return tokens
+        end
+    end
+
+    class SpaceRemover < AbstractorPiece
+        def abstract(tokens)
+            tokens.delete_if { |t| t.type == :space }
+            return tokens
+        end
+    end
+
+    class NewlineRemover < AbstractorPiece
+        def abstract(tokens)
+            tokens.delete_if { |t| t.type == :newline }
+            return tokens
+        end
+    end
+
+    class CommentRemover < AbstractorPiece
+        def abstract(tokens)
+            tokens.delete_if { |t| t.type == :comment }
+            return tokens
         end
     end
 end
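
The refactoring above replaces the single hard-coded identifier pass with small AbstractorPiece objects that the Abstractor queues and runs in order; deabstract! walks the same pieces to restore values from the per-category dictionaries. A minimal sketch of the new API, with a hand-built token list for illustration (the require name is assumed from the gem name):

    require 'code-lexer'   # assumed entry point; adjust if the gem exposes a different require path

    # Hand-built tokens; in practice they come from Lexer#lex.
    tokens = [
        CodeLexer::Token.new(:identifier, "counter"),
        CodeLexer::Token.new(:operator, "="),
        CodeLexer::Token.new(:number, "42")
    ]

    abstractor = CodeLexer::Abstractor.new
    abstractor.abstract_identifiers.abstract_numbers   # queues IdentifierAbstractor and NumberAbstractor
    abstractor.abstract!(tokens)

    p tokens.map(&:abstracted_value)      # identifier becomes an ID<n> placeholder, number becomes NUMBER
    p abstractor.identifiers_dictionary   # => ["NOOP", "counter"]

    abstractor.deabstract!(tokens)        # restores values and types from the dictionaries

Note that #dictionary still works but now emits the deprecation warning and forwards to #identifiers_dictionary.
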
data/lib/code-lexer/languages/javascript.yml
CHANGED

@@ -4,12 +4,14 @@ lexer:
     identifier:
         - "[$A-Za-z_][$A-Za-z0-9_]*"
     comment:
-        - \/\/[^\n\r]*[\n\r]
+        - \/\/[^\n\r]*(?=[\n\r])
         - \/\/.*$
         - \/\*([^*]|[\r\n]|(\*+([^*/]|[\r\n])))*\*+\/
     string:
         - \"([^"]|\\\")*\"
         - \'[^']*\'
+    regex:
+        - \/([^/]|\\\/)*\/[gim]*
     number:
         - \-?[0-9]*\.[0-9]e\-?[0-9]+
         - \-?[0-9]*\.[0-9]

@@ -19,13 +21,13 @@ lexer:
         - \-?0[0-7]+
     operator:
         - (\=\=\=|\!\=\=)
-        - (
+        - (\<\=|\>\=|\=\=|\!\=|\=\>)
         - (\&\&|\|\||\!)
         - (\+\=|\-\=|\/\=|\*\=|\%\=|\+\+|\-\-)
         - (\&|\||\~|\^|\<\<|\>\>)
         - (\=|\+|\-|\/|\*|\%)
         - (\.|\,|\:)
-        - (
+        - (\<|\>|\?)
     parenthesis:
         - (\(|\)|\[|\]|\{|\})
     semicolon:
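
Two of the pattern changes are easiest to see on small inputs: the single-line comment rule now uses a lookahead, so the trailing newline is left for the newline rule instead of being swallowed by the comment token, and the new regex category matches JavaScript regex literals with optional flags. A quick check of the raw patterns, re-escaped as Ruby regex literals and evaluated outside the lexer:

    comment = /\/\/[^\n\r]*(?=[\n\r])/
    regex   = /\/([^\/]|\\\/)*\/[gim]*/

    p "x = 1; // note\n".match(comment)[0]   # => "// note"   (the \n is not part of the match)
    p "str.split(/\\s+/g)".match(regex)[0]   # => "/\\s+/g"
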
data/lib/code-lexer/lexer.rb
CHANGED

@@ -12,7 +12,7 @@ module CodeLexer
             end
         end

-        def lex(content)
+        def lex(content, abstractor = nil)
             content = content.clone
             tokens = []
             while content.length > 0

@@ -23,17 +23,31 @@ module CodeLexer
                 end
             end

-            return LexedContent.new(tokens)
+            return LexedContent.new(tokens, abstractor)
         end
     end

     class LexedContent
         attr_reader :tokens
+        attr_reader :abstractor

-        def
+        def self.from_stream_string(stream, abstractor)
+            tokens = stream.split(" ").map { |t| Token.from_string(t) }
+            abstractor.deabstract!(tokens)
+            return LexedContent.new(tokens, abstractor)
+        end
+
+        def initialize(tokens, abstractor = nil)
             @tokens = tokens
+            @abstractor = abstractor
+
+            @abstractor.abstract!(@tokens) if @abstractor
+        end
+
+        def reconstruct
+            @tokens.map { |t| t.value.to_s }.join("")
         end
-
+
         def token_lines
             result = []
             current_line = []

@@ -53,14 +67,73 @@ module CodeLexer
         end

         def token_stream(abstractor = nil)
-            abstractor.abstract!(@tokens) if abstractor
-
             result = []
-
+
+            tokens = @tokens
+            if abstractor
+                tokens = tokens.map { |t| t.clone }
+                tokens.each { |t| t.reset_abstraction }
+                abstractor.abstract!(tokens)
+            end
+
+            tokens.each do |token|
                 result << token.abstracted_value
             end

             return result.join(" ")
         end
+
+        def to_s
+            @tokens.map { |t| t.value }.join("")
+        end
+
+        def dump(filename, mode = "w", force = false)
+            if mode.downcase.include?("w") && !force
+                if FileTest.exist?(filename) || FileTest.exist?(lexdata(filename))
+                    raise "Destination filename or lexdata filename already exist."
+                end
+            end
+
+            File.open(filename, mode) do |f|
+                f << self.token_stream + "\n"
+            end
+
+            File.open(lexdata(filename), "#{mode}b") do |f|
+                f << Marshal.dump(@abstractor)
+            end
+        end
+
+        def self.load(file_or_filename, lexdata_or_lexdata_filename = nil)
+            if file_or_filename.is_a?(String) && (lexdata_or_lexdata_filename.is_a?(String) || !lexdata_or_lexdata_filename)
+                unless lexdata_or_lexdata_filename
+                    return self.load_filename(file_or_filename)
+                else
+                    return self.load_filename(file_or_filename, lexdata_or_lexdata_filename)
+                end
+            elsif file_or_filename.is_a?(File) && lexdata_or_lexdata_filename.is_a?(File)
+                return self.load_file(file_or_filename, lexdata_or_lexdata_filename)
+            else
+                raise "Unable to call with the provided input types: expected (String, String), (String), or (File, File)"
+            end
+        end
+
+        def self.load_filename(filename, lexdata_filename = filename + ".lexdata")
+            File.open(filename, "r") do |file|
+                File.open(lexdata_filename, "rb") do |lexdata_file|
+                    return LexedContent.load_file(file, lexdata_file)
+                end
+            end
+        end
+
+        def self.load_file(file, lexdata_file)
+            line = file.readline
+            abstractor = Marshal.load(lexdata_file)
+            return LexedContent.from_stream_string(line, abstractor)
+        end
+
+        private
+        def lexdata(filename)
+            filename + ".lexdata"
+        end
     end
 end
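
LexedContent can now persist itself: dump writes the abstracted token stream to the given file and a Marshal'd copy of the abstractor to a companion .lexdata file, while load (or load_filename/load_file) reverses the process via from_stream_string. A rough round-trip sketch under the same assumptions as the earlier example (hand-built tokens, assumed require name):

    require 'code-lexer'   # assumed entry point

    abstractor = CodeLexer::Abstractor.new.abstract_identifiers.abstract_numbers
    tokens = [
        CodeLexer::Token.new(:identifier, "total"),
        CodeLexer::Token.new(:operator, "+="),
        CodeLexer::Token.new(:number, "1")
    ]

    # Passing the abstractor to the constructor abstracts the tokens right away.
    lexed = CodeLexer::LexedContent.new(tokens, abstractor)

    lexed.dump("total.lex")                               # also writes "total.lex.lexdata"
    restored = CodeLexer::LexedContent.load("total.lex")
    puts restored.reconstruct                             # => total+=1 (no space tokens in the hand-built list)

Two details worth noting from the diff: dump raises if either destination file already exists unless force is set, and token_stream(abstractor) now works on cloned tokens, so re-abstracting for output no longer mutates the stored token list.
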
data/lib/code-lexer/token.rb
CHANGED

@@ -11,6 +11,19 @@ module CodeLexer
         attr_accessor :value
         attr_accessor :abstracted_value

+        def self.from_string(string)
+            unless string.start_with?(SPECIAL_TOKEN_OPEN)
+                value = string
+            else
+                value = nil
+            end
+
+            token = Token.new(:unknown, value)
+            token.abstracted_value = string
+
+            return token
+        end
+
         def initialize(type, value)
             @type = type
             self.value = value

@@ -18,13 +31,7 @@ module CodeLexer

         def value=(v)
             @value = v
-
-                @abstracted_value = Token.special("NEWLINE")
-            elsif v =~ /\s/
-                @abstracted_value = Token.special(v.gsub(/\s/, "·"))
-            else
-                @abstracted_value = v
-            end
+            self.reset_abstraction
         end

         def to_s

@@ -38,5 +45,15 @@ module CodeLexer
         def ==(oth)
             @type == oth.type && @value == oth.value && @abstracted_value == oth.abstracted_value
         end
+
+        def reset_abstraction
+            if @type == :newline
+                @abstracted_value = Token.special("NEWLINE")
+            elsif @value =~ /\s/
+                @abstracted_value = Token.special(@value.gsub(/\s/, "·"))
+            else
+                @abstracted_value = @value.clone
+            end
+        end
     end
 end
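
Token.from_string is the inverse used by LexedContent.from_stream_string: a plain lexeme keeps its value, while an abstracted placeholder (anything starting with SPECIAL_TOKEN_OPEN) comes back with a nil value and an :unknown type until the abstractor's deabstract! fills both in. Roughly, assuming Token.special behaves as it is used elsewhere in this diff:

    plain = CodeLexer::Token.from_string("while")
    plain.value               # => "while"
    plain.abstracted_value    # => "while"

    special = CodeLexer::Token.from_string(CodeLexer::Token.special("ID0"))
    special.value             # => nil, restored later by IdentifierAbstractor#deabstract
    special.abstracted_value  # => the delimited "ID0" placeholder string
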
metadata
CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: code-lexer
 version: !ruby/object:Gem::Version
-  version: '0.6'
+  version: '0.7'
 platform: ruby
 authors:
 - Simone Scalabrino
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2022-01-05 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: code-assertions

@@ -62,7 +62,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.
+rubygems_version: 3.3.3
 signing_key:
 specification_version: 4
 summary: Simple source code lexer