code-lexer 0.3 → 0.7

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1d118a62a0320daf8a38c03fc9133dddbb6c5ab55d1386c323a53fb9b7ace59e
4
- data.tar.gz: 6c5e4401890025c6b379e289c9b63a581d4a3731ff62f5f903e95ec153145274
3
+ metadata.gz: 63422a570f2d8b96e95ee2dfce7f67b797df7aabeca3b1a92c67535d497bd5f7
4
+ data.tar.gz: 6b2abf7eaf8cc3518de8998b67be1fd4823c4a59db2c879aec04561e6453ac3e
5
5
  SHA512:
6
- metadata.gz: 116f92a402b3f4077a357a35e3241934b1acc4760569e57ffa14fa07dbd7276f8d020d4c88d4aa8191c1cfcb48140cc4401dcfdb5a8f0e6328383a80afb0041c
7
- data.tar.gz: be5d36fe5a1434f9771194584ebe9b45fdf8fdd257e1ab277cd180940448459ca6791e5140a9831078d24542d61185aa89014e37c13f44dfa14e900c33d56e90
6
+ metadata.gz: 27a3cdf2d95c3e832c48988441e1f5eb466ec595399cbf93676fccaf6b5c6edfb58337782742ebcc1bae6b8052ab05ae3d61e01ad297706d00298a4b22160ff2
7
+ data.tar.gz: 9b2e9fac2f678751f11018a13589b692871a86b4ab652524c3af9f5c042266f4398ee902e9310b77f5ded49ae8357503d99bf6740ec42de914f1da283817aa75
@@ -1,11 +1,14 @@
1
1
  require_relative 'token'
2
2
 
3
3
  module CodeLexer
4
- class Abstractor
5
- attr_reader :dictionary
6
-
7
- def initialize(dictionary=[])
8
- @dictionary = ["NOOP"] + dictionary
4
+ class Abstractor
5
+ def initialize(identifiers_dictionary = [], strings_dictionary = [], numbers_dictionary = [])
6
+ @dictionary = {}
7
+ @dictionary[:identifiers] = ['NOOP'] + identifiers_dictionary
8
+ @dictionary[:strings] = strings_dictionary
9
+ @dictionary[:numbers] = numbers_dictionary
10
+
11
+ @abstractor_pieces = []
9
12
  end
10
13
 
11
14
  def abstract_everything
@@ -18,103 +21,240 @@ module CodeLexer
18
21
  return self
19
22
  end
20
23
 
24
+ def dictionary
25
+ warn "[DEPRECATION] The method CodeLexer::Abstractor#dictionary is deprecated; used CodeLexer::Abstractor#identifiers_dictionary instead"
26
+ self.identifiers_dictionary
27
+ end
28
+
29
+ def identifiers_dictionary
30
+ @dictionary[:identifiers]
31
+ end
32
+
33
+ def strings_dictionary
34
+ @dictionary[:strings]
35
+ end
36
+
37
+ def numbers_dictionary
38
+ @dictionary[:numbers]
39
+ end
40
+
41
+ def dictionaries
42
+ @dictionary
43
+ end
44
+
21
45
  def abstract_identifiers
22
- @abstract_identifiers = true
46
+ @abstractor_pieces << IdentifierAbstractor.new(self)
23
47
  return self
24
48
  end
25
49
 
26
50
  def abstract_numbers
27
- @abstract_numbers = true
51
+ @abstractor_pieces << NumberAbstractor.new(self)
28
52
  return self
29
53
  end
30
54
 
31
55
  def abstract_comments
32
- @abstract_comments = true
56
+ @abstractor_pieces << CommentAbstractor.new(self)
33
57
  return self
34
58
  end
35
59
 
36
60
  def abstract_strings
37
- @abstract_strings = true
61
+ @abstractor_pieces << StringAbstractor.new(self)
38
62
  return self
39
63
  end
40
64
 
41
65
  def abstract_spaces
42
- @abstract_spaces = true
66
+ @abstractor_pieces << SpaceAbstractor.new(self)
43
67
  return self
44
68
  end
45
69
 
46
70
  def remove_spaces
47
- @remove_spaces = true
71
+ @abstractor_pieces << SpaceRemover.new(self)
48
72
  return self
49
73
  end
50
74
 
51
75
  def remove_newlines
52
- @remove_newlines = true
76
+ @abstractor_pieces << NewlineRemover.new(self)
53
77
  return self
54
78
  end
55
79
 
56
80
  def remove_comments
57
- @remove_comments = true
81
+ @abstractor_pieces << CommentRemover.new(self)
58
82
  return self
59
83
  end
60
84
 
61
85
  def abstract!(tokens)
62
- if @abstract_identifiers
63
- identifier_tokens = tokens.select { |t| t.type == :identifier }
64
- identifiers = identifier_tokens.map { |id| id.value }.uniq
65
-
66
- identifiers.each do |id|
67
- if @dictionary.include?(id)
68
- abstracted_id = @dictionary.index(id)
69
- else
70
- abstracted_id = @dictionary.size
71
- @dictionary << id
72
- end
73
-
74
- identifier_tokens.select { |t| t.value == id }.each do |matching_token|
75
- matching_token.abstracted_value = Token.special("ID#{abstracted_id}")
76
- end
77
- end
86
+ @abstractor_pieces.each do |abstractor_piece|
87
+ tokens = abstractor_piece.abstract(tokens)
78
88
  end
79
89
 
80
- if @remove_comments
81
- tokens.delete_if { |t| t.type == :comment }
82
- elsif @abstract_comments
83
- tokens.select { |t| t.type == :comment }.each do |comment_token|
84
- comment_token.abstracted_value = Token.special("COMMENT")
85
- end
90
+ return self
91
+ end
92
+
93
+ def deabstract!(tokens)
94
+ @abstractor_pieces.each do |abstractor_piece|
95
+ tokens = abstractor_piece.deabstract(tokens)
86
96
  end
87
97
 
88
- if @abstract_numbers
89
- tokens.select { |t| t.type == :number }.each do |number_token|
90
- number_token.abstracted_value = Token.special("NUMBER")
98
+ return self
99
+ end
100
+ end
101
+
102
+ class AbstractorPiece
103
+ def initialize(abstractor)
104
+ @abstractor = abstractor
105
+ end
106
+
107
+ def abstract(tokens)
108
+ return tokens
109
+ end
110
+
111
+ def deabstract(tokens)
112
+ return tokens
113
+ end
114
+ end
115
+
116
+ class IdentifierAbstractor < AbstractorPiece
117
+ def abstract(tokens)
118
+ identifier_tokens = tokens.select { |t| t.type == :identifier }
119
+ identifiers = identifier_tokens.map { |id| id.value }.uniq
120
+
121
+ identifiers.each do |id|
122
+ if @abstractor.identifiers_dictionary.include?(id)
123
+ abstracted_id = @abstractor.identifiers_dictionary.index(id)
124
+ else
125
+ abstracted_id = @abstractor.identifiers_dictionary.size
126
+ @abstractor.identifiers_dictionary << id
127
+ end
128
+
129
+ identifier_tokens.select { |t| t.value == id }.each do |matching_token|
130
+ matching_token.abstracted_value = Token.special("ID#{abstracted_id}")
91
131
  end
92
132
  end
93
133
 
94
- if @abstract_strings
95
- tokens.select { |t| t.type == :string }.each do |string_token|
96
- string_token.abstracted_value = Token.special("STRING")
97
- end
134
+ return tokens
135
+ end
136
+
137
+ def deabstract(tokens)
138
+ tokens.select { |t| t.abstracted_value.match?(/.ID[0-9]+./) }.each do |token|
139
+ id = token.abstracted_value.scan(/.ID([0-9]+)./).flatten[0].to_i
140
+
141
+ token.type = :identifier
142
+ token.value = @abstractor.identifiers_dictionary[id]
98
143
  end
99
144
 
100
- if @remove_newlines
101
- tokens.delete_if { |t| t.type == :newline }
145
+ return tokens
146
+ end
147
+ end
148
+
149
+ class NumberAbstractor < AbstractorPiece
150
+ def abstract(tokens)
151
+ tokens.select { |t| t.type == :number }.each do |number_token|
152
+ number_token.abstracted_value = Token.special("NUMBER")
153
+ @abstractor.numbers_dictionary << number_token.value
102
154
  end
103
155
 
104
- if @remove_spaces
105
- tokens.delete_if { |t| t.type == :space }
106
- elsif @abstract_spaces
107
- tokens.select { |t| t.type == :space }.each do |space_token|
108
- previous_index = tokens.index(space_token) - 1
109
- if previous_index < 0 || tokens[previous_index].type == :newline
110
- space_token.abstracted_value = Token.special("INDENTATION")
111
- else
112
- space_token.abstracted_value = Token.special("WHITESPACE")
113
- end
156
+ return tokens
157
+ end
158
+
159
+ def deabstract(tokens)
160
+ id = 0
161
+ tokens.select { |t| t.abstracted_value == Token.special("NUMBER") }.each do |token|
162
+ token.type = :number
163
+ token.value = @abstractor.numbers_dictionary[id]
164
+
165
+ id += 1
166
+ end
167
+
168
+ return tokens
169
+ end
170
+ end
171
+
172
+ class StringAbstractor < AbstractorPiece
173
+ def abstract(tokens)
174
+ tokens.select { |t| t.type == :string }.each do |string_token|
175
+ string_token.abstracted_value = Token.special("STRING")
176
+ @abstractor.strings_dictionary << string_token.value
177
+ end
178
+
179
+ return tokens
180
+ end
181
+
182
+ def deabstract(tokens)
183
+ id = 0
184
+ tokens.select { |t| t.abstracted_value == Token.special("STRING") }.each do |token|
185
+ token.type = :string
186
+ token.value = '"' + @abstractor.strings_dictionary[id] + '"'
187
+
188
+ id += 1
189
+ end
190
+
191
+ return tokens
192
+ end
193
+ end
194
+
195
+ class CommentAbstractor < AbstractorPiece
196
+ def abstract(tokens)
197
+ tokens.select { |t| t.type == :comment }.each do |comment_token|
198
+ comment_token.abstracted_value = Token.special("COMMENT")
199
+ end
200
+ return tokens
201
+ end
202
+
203
+ def deabstract(tokens)
204
+ tokens.select { |t| t.abstracted_value == Token.special("COMMENT") }.each do |token|
205
+ token.type = :comment
206
+ token.value = 'Unknown comment'
207
+ end
208
+
209
+ return tokens
210
+ end
211
+ end
212
+
213
+ class SpaceAbstractor < AbstractorPiece
214
+ def abstract(tokens)
215
+ tokens.select { |t| t.type == :space }.each do |space_token|
216
+ previous_index = tokens.index(space_token) - 1
217
+ if previous_index < 0 || tokens[previous_index].type == :newline
218
+ space_token.abstracted_value = Token.special("INDENTATION")
219
+ else
220
+ space_token.abstracted_value = Token.special("WHITESPACE")
114
221
  end
115
222
  end
116
223
 
117
- return self
224
+ return tokens
225
+ end
226
+
227
+ def deabstract(tokens)
228
+ tokens.select do |t|
229
+ t.abstracted_value == Token.special("INDENTATION") ||
230
+ t.abstracted_value == Token.special("WHITESPACE")
231
+ end.each do |token|
232
+ token.type = :space
233
+ token.value = ' '
234
+ end
235
+
236
+ return tokens
237
+ end
238
+ end
239
+
240
+ class SpaceRemover < AbstractorPiece
241
+ def abstract(tokens)
242
+ tokens.delete_if { |t| t.type == :space }
243
+ return tokens
244
+ end
245
+ end
246
+
247
+ class NewlineRemover < AbstractorPiece
248
+ def abstract(tokens)
249
+ tokens.delete_if { |t| t.type == :newline }
250
+ return tokens
251
+ end
252
+ end
253
+
254
+ class CommentRemover < AbstractorPiece
255
+ def abstract(tokens)
256
+ tokens.delete_if { |t| t.type == :comment }
257
+ return tokens
118
258
  end
119
259
  end
120
260
  end
@@ -1,3 +1,5 @@
1
+ require 'yaml'
2
+
1
3
  module CodeLexer
2
4
  class Config
3
5
  attr_reader :rules
@@ -5,7 +7,7 @@ module CodeLexer
5
7
  @config = File.basename(path)
6
8
  @rules = []
7
9
 
8
- load_rules(File.read(path))
10
+ load_rules(path)
9
11
  end
10
12
 
11
13
  def matching_rule(text)
@@ -25,11 +27,14 @@ module CodeLexer
25
27
 
26
28
  private
27
29
  def load_rules(content)
28
- content.split("\n").each do |line|
29
- name, regex = line.split(":", 2)
30
- regex = Regexp.new("^" + regex)
31
-
32
- @rules << [name.to_sym, regex]
30
+ parsed = YAML.load_file(content)
31
+
32
+
33
+ parsed['lexer'].each do |name, regexs|
34
+ regexs.each do |regex|
35
+ regex = Regexp.new("^" + regex, Regexp::MULTILINE)
36
+ @rules << [name.to_sym, regex]
37
+ end
33
38
  end
34
39
 
35
40
  @rules << [:other, /./]
@@ -0,0 +1,38 @@
1
+ lexer:
2
+ keyword:
3
+ - (?:abstract|arguments|boolean|break|byte|case|catch|char|const|continue|debugger|default|delete|do|double|else|eval|false|final|finally|float|for|function|goto|if|implements|in|instanceof|int|interface|let|long|native|new|null|package|private|protected|public|return|short|static|switch|synchronized|this|throw|throws|transient|true|try|typeof|var|void|volatile|while|with|yield|class|enum|export|extends|import|super|from)
4
+ identifier:
5
+ - "[$A-Za-z_][$A-Za-z0-9_]*"
6
+ comment:
7
+ - \/\/[^\n\r]*(?=[\n\r])
8
+ - \/\/.*$
9
+ - \/\*([^*]|[\r\n]|(\*+([^*/]|[\r\n])))*\*+\/
10
+ string:
11
+ - \"([^"]|\\\")*\"
12
+ - \'[^']*\'
13
+ regex:
14
+ - \/([^/]|\\\/)*\/[gim]*
15
+ number:
16
+ - \-?[0-9]*\.[0-9]e\-?[0-9]+
17
+ - \-?[0-9]*\.[0-9]
18
+ - \-?[1-9][0-9]*
19
+ - \-?0[Xx][0-9A-Fa-f]+
20
+ - \-?[0-9]
21
+ - \-?0[0-7]+
22
+ operator:
23
+ - (\=\=\=|\!\=\=)
24
+ - (\<\=|\>\=|\=\=|\!\=|\=\>)
25
+ - (\&\&|\|\||\!)
26
+ - (\+\=|\-\=|\/\=|\*\=|\%\=|\+\+|\-\-)
27
+ - (\&|\||\~|\^|\<\<|\>\>)
28
+ - (\=|\+|\-|\/|\*|\%)
29
+ - (\.|\,|\:)
30
+ - (\<|\>|\?)
31
+ parenthesis:
32
+ - (\(|\)|\[|\]|\{|\})
33
+ semicolon:
34
+ - \;
35
+ newline:
36
+ - "[\\n\\r]"
37
+ space:
38
+ - \s+
@@ -12,7 +12,7 @@ module CodeLexer
12
12
  end
13
13
  end
14
14
 
15
- def lex(content)
15
+ def lex(content, abstractor = nil)
16
16
  content = content.clone
17
17
  tokens = []
18
18
  while content.length > 0
@@ -23,17 +23,31 @@ module CodeLexer
23
23
  end
24
24
  end
25
25
 
26
- return LexedContent.new(tokens)
26
+ return LexedContent.new(tokens, abstractor)
27
27
  end
28
28
  end
29
29
 
30
30
  class LexedContent
31
31
  attr_reader :tokens
32
+ attr_reader :abstractor
32
33
 
33
- def initialize(tokens)
34
+ def self.from_stream_string(stream, abstractor)
35
+ tokens = stream.split(" ").map { |t| Token.from_string(t) }
36
+ abstractor.deabstract!(tokens)
37
+ return LexedContent.new(tokens, abstractor)
38
+ end
39
+
40
+ def initialize(tokens, abstractor = nil)
34
41
  @tokens = tokens
42
+ @abstractor = abstractor
43
+
44
+ @abstractor.abstract!(@tokens) if @abstractor
45
+ end
46
+
47
+ def reconstruct
48
+ @tokens.map { |t| t.value.to_s }.join("")
35
49
  end
36
-
50
+
37
51
  def token_lines
38
52
  result = []
39
53
  current_line = []
@@ -53,14 +67,73 @@ module CodeLexer
53
67
  end
54
68
 
55
69
  def token_stream(abstractor = nil)
56
- abstractor.abstract!(@tokens) if abstractor
57
-
58
70
  result = []
59
- @tokens.each do |token|
71
+
72
+ tokens = @tokens
73
+ if abstractor
74
+ tokens = tokens.map { |t| t.clone }
75
+ tokens.each { |t| t.reset_abstraction }
76
+ abstractor.abstract!(tokens)
77
+ end
78
+
79
+ tokens.each do |token|
60
80
  result << token.abstracted_value
61
81
  end
62
82
 
63
83
  return result.join(" ")
64
84
  end
85
+
86
+ def to_s
87
+ @tokens.map { |t| t.value }.join("")
88
+ end
89
+
90
+ def dump(filename, mode = "w", force = false)
91
+ if mode.downcase.include?("w") && !force
92
+ if FileTest.exist?(filename) || FileTest.exist?(lexdata(filename))
93
+ raise "Destination filename or lexdata filename already exist."
94
+ end
95
+ end
96
+
97
+ File.open(filename, mode) do |f|
98
+ f << self.token_stream + "\n"
99
+ end
100
+
101
+ File.open(lexdata(filename), "#{mode}b") do |f|
102
+ f << Marshal.dump(@abstractor)
103
+ end
104
+ end
105
+
106
+ def self.load(file_or_filename, lexdata_or_lexdata_filename = nil)
107
+ if file_or_filename.is_a?(String) && (lexdata_or_lexdata_filename.is_a?(String) || !lexdata_or_lexdata_filename)
108
+ unless lexdata_or_lexdata_filename
109
+ return self.load_filename(file_or_filename)
110
+ else
111
+ return self.load_filename(file_or_filename, lexdata_or_lexdata_filename)
112
+ end
113
+ elsif file_or_filename.is_a?(File) && lexdata_or_lexdata_filename.is_a?(File)
114
+ return self.load_file(file_or_filename, lexdata_or_lexdata_filename)
115
+ else
116
+ raise "Unable to call with the provided input types: expected (String, String), (String), or (File, File)"
117
+ end
118
+ end
119
+
120
+ def self.load_filename(filename, lexdata_filename = filename + ".lexdata")
121
+ File.open(filename, "r") do |file|
122
+ File.open(lexdata_filename, "rb") do |lexdata_file|
123
+ return LexedContent.load_file(file, lexdata_file)
124
+ end
125
+ end
126
+ end
127
+
128
+ def self.load_file(file, lexdata_file)
129
+ line = file.readline
130
+ abstractor = Marshal.load(lexdata_file)
131
+ return LexedContent.from_stream_string(line, abstractor)
132
+ end
133
+
134
+ private
135
+ def lexdata(filename)
136
+ filename + ".lexdata"
137
+ end
65
138
  end
66
139
  end
@@ -11,6 +11,19 @@ module CodeLexer
11
11
  attr_accessor :value
12
12
  attr_accessor :abstracted_value
13
13
 
14
+ def self.from_string(string)
15
+ unless string.start_with?(SPECIAL_TOKEN_OPEN)
16
+ value = string
17
+ else
18
+ value = nil
19
+ end
20
+
21
+ token = Token.new(:unknown, value)
22
+ token.abstracted_value = string
23
+
24
+ return token
25
+ end
26
+
14
27
  def initialize(type, value)
15
28
  @type = type
16
29
  self.value = value
@@ -18,13 +31,7 @@ module CodeLexer
18
31
 
19
32
  def value=(v)
20
33
  @value = v
21
- if @type == :newline
22
- @abstracted_value = Token.special("NEWLINE")
23
- elsif v =~ /\s/
24
- @abstracted_value = Token.special(v.gsub(/\s/, "·"))
25
- else
26
- @abstracted_value = v
27
- end
34
+ self.reset_abstraction
28
35
  end
29
36
 
30
37
  def to_s
@@ -38,5 +45,15 @@ module CodeLexer
38
45
  def ==(oth)
39
46
  @type == oth.type && @value == oth.value && @abstracted_value == oth.abstracted_value
40
47
  end
48
+
49
+ def reset_abstraction
50
+ if @type == :newline
51
+ @abstracted_value = Token.special("NEWLINE")
52
+ elsif @value =~ /\s/
53
+ @abstracted_value = Token.special(@value.gsub(/\s/, "·"))
54
+ else
55
+ @abstracted_value = @value.clone
56
+ end
57
+ end
41
58
  end
42
59
  end
data/lib/code-lexer.rb CHANGED
@@ -5,6 +5,6 @@ require_relative 'code-lexer/token'
5
5
 
6
6
  module CodeLexer
7
7
  def self.get(language)
8
- return Lexer.new("#{File.dirname(File.expand_path(__FILE__))}/code-lexer/languages/#{language}.clex")
8
+ return Lexer.new("#{File.dirname(File.expand_path(__FILE__))}/code-lexer/languages/#{language}.yml")
9
9
  end
10
10
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: code-lexer
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.3'
4
+ version: '0.7'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Simone Scalabrino
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-11-28 00:00:00.000000000 Z
11
+ date: 2022-01-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: code-assertions
@@ -40,7 +40,7 @@ files:
40
40
  - lib/code-lexer.rb
41
41
  - lib/code-lexer/abstractor.rb
42
42
  - lib/code-lexer/config.rb
43
- - lib/code-lexer/languages/javascript.clex
43
+ - lib/code-lexer/languages/javascript.yml
44
44
  - lib/code-lexer/lexer.rb
45
45
  - lib/code-lexer/token.rb
46
46
  homepage: https://github.com/intersimone999/code-lexer
@@ -62,7 +62,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
62
62
  - !ruby/object:Gem::Version
63
63
  version: '0'
64
64
  requirements: []
65
- rubygems_version: 3.2.32
65
+ rubygems_version: 3.3.3
66
66
  signing_key:
67
67
  specification_version: 4
68
68
  summary: Simple source code lexer
@@ -1,25 +0,0 @@
1
- keyword:(?:abstract|arguments|boolean|break|byte|case|catch|char|const|continue|debugger|default|delete|do|double|else|eval|false|final|finally|float|for|function|goto|if|implements|in|instanceof|int|interface|let|long|native|new|null|package|private|protected|public|return|short|static|switch|synchronized|this|throw|throws|transient|true|try|typeof|var|void|volatile|while|with|yield|class|enum|export|extends|import|super|from)
2
- identifier:[$A-Za-z_][$A-Za-z0-9_]*
3
- comment:\/\/[^.]*[\n\r]
4
- comment:\/\/[^.]*$
5
- comment:\/\*([^*]|[\r\n]|(\*+([^*/]|[\r\n])))*\*+\/
6
- string:\"([^"]|\\\")*\"
7
- string:\'[^']*\'
8
- number:\-?[0-9]
9
- number:\-?[1-9][0-9]*
10
- number:\-?[0-9]*\.[0-9]
11
- number:\-?[0-9]*\.[0-9]e\-?[0-9]+
12
- number:\-?0[Xx][0-9A-Fa-f]+
13
- number:\-?0[0-7]+
14
- operator:(\=\=\=|\!\=\=)
15
- operator:(\<\=|\>\=|\=\=|\!\=)
16
- operator:(\&\&|\|\||\!)
17
- operator:(\+\=|\-\=|\/\=|\*\=|\%\=|\+\+|\-\-)
18
- operator:(\&|\||\~|\^|\<\<|\>\>)
19
- operator:(\=|\+|\-|\/|\*|\%)
20
- operator:(\.|\,|\:)
21
- operator:(\<|\>)
22
- parenthesis:(\(|\)|\[|\]|\{|\})
23
- semicolon:\;
24
- newline:[\n\r]
25
- space:\s+