code-lexer 0.6 → 0.8

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 0be86a493e60a21bc3c8d16598c6f6af914a2521255bb604b01ffa9410ff399f
- data.tar.gz: 46e60be81ceda1cc9621ef7519189a45cceaaa54afa27313a5c6d4c8be0e8377
+ metadata.gz: 28e3de74936d4c5e81abc995cec85c4831c2383eedc098ef9097c45002e59bbb
+ data.tar.gz: a7f113035e970f213de2e0301454b6851eba80f661adb4ce4720545e280fbcef
  SHA512:
- metadata.gz: adcc4ee93e5c58da53df775584dbd4de485d8af7af232e2a7aa379a1ec09c1e928a804139983d585773f3b837bdb6f9fe95623107dfceaa0fb3afc2ddd5a7593
- data.tar.gz: e9ee16aa762e15515d58930b29b5c8f784baee483f40fa1ae520734a0efc98f7680ffa5a4f6c88de3c261b836e17bb9e529aa4adc55314928d3a60f7f0d0d2da
+ metadata.gz: 1ccf664386fd4ca8b505658d0059d1d44a4b428b84ab79ed5ea1f9d12219f7daa6261d1d058b797080d8719c1d842fad47441d2b99da578c4691c00e5e109efe
+ data.tar.gz: 65914ad6a9f937ce884a5e7c737e4e9857b5be9aa5dc81ba912d6c25a44deff5efeaff2b11b1a0cfe6ad7b77859a2862fe745d9a280b3f639be2e3cb3a678d6e
lib/code-lexer/abstractor.rb CHANGED
@@ -1,11 +1,14 @@
  require_relative 'token'

  module CodeLexer
- class Abstractor
- attr_reader :dictionary
-
- def initialize(dictionary=[])
- @dictionary = ["NOOP"] + dictionary
+ class Abstractor
+ def initialize(identifiers_dictionary = [], strings_dictionary = [], numbers_dictionary = [])
+ @dictionary = {}
+ @dictionary[:identifiers] = ['NOOP'] + identifiers_dictionary
+ @dictionary[:strings] = strings_dictionary
+ @dictionary[:numbers] = numbers_dictionary
+
+ @abstractor_pieces = []
  end

  def abstract_everything
@@ -18,103 +21,240 @@ module CodeLexer
  return self
  end

+ def dictionary
+ warn "[DEPRECATION] The method CodeLexer::Abstractor#dictionary is deprecated; used CodeLexer::Abstractor#identifiers_dictionary instead"
+ self.identifiers_dictionary
+ end
+
+ def identifiers_dictionary
+ @dictionary[:identifiers]
+ end
+
+ def strings_dictionary
+ @dictionary[:strings]
+ end
+
+ def numbers_dictionary
+ @dictionary[:numbers]
+ end
+
+ def dictionaries
+ @dictionary
+ end
+
  def abstract_identifiers
- @abstract_identifiers = true
+ @abstractor_pieces << IdentifierAbstractor.new(self)
  return self
  end

  def abstract_numbers
- @abstract_numbers = true
+ @abstractor_pieces << NumberAbstractor.new(self)
  return self
  end

  def abstract_comments
- @abstract_comments = true
+ @abstractor_pieces << CommentAbstractor.new(self)
  return self
  end

  def abstract_strings
- @abstract_strings = true
+ @abstractor_pieces << StringAbstractor.new(self)
  return self
  end

  def abstract_spaces
- @abstract_spaces = true
+ @abstractor_pieces << SpaceAbstractor.new(self)
  return self
  end

  def remove_spaces
- @remove_spaces = true
+ @abstractor_pieces << SpaceRemover.new(self)
  return self
  end

  def remove_newlines
- @remove_newlines = true
+ @abstractor_pieces << NewlineRemover.new(self)
  return self
  end

  def remove_comments
- @remove_comments = true
+ @abstractor_pieces << CommentRemover.new(self)
  return self
  end

  def abstract!(tokens)
- if @abstract_identifiers
- identifier_tokens = tokens.select { |t| t.type == :identifier }
- identifiers = identifier_tokens.map { |id| id.value }.uniq
-
- identifiers.each do |id|
- if @dictionary.include?(id)
- abstracted_id = @dictionary.index(id)
- else
- abstracted_id = @dictionary.size
- @dictionary << id
- end
-
- identifier_tokens.select { |t| t.value == id }.each do |matching_token|
- matching_token.abstracted_value = Token.special("ID#{abstracted_id}")
- end
- end
+ @abstractor_pieces.each do |abstractor_piece|
+ tokens = abstractor_piece.abstract(tokens)
  end

- if @remove_comments
- tokens.delete_if { |t| t.type == :comment }
- elsif @abstract_comments
- tokens.select { |t| t.type == :comment }.each do |comment_token|
- comment_token.abstracted_value = Token.special("COMMENT")
- end
+ return self
+ end
+
+ def deabstract!(tokens)
+ @abstractor_pieces.each do |abstractor_piece|
+ tokens = abstractor_piece.deabstract(tokens)
  end

- if @abstract_numbers
- tokens.select { |t| t.type == :number }.each do |number_token|
- number_token.abstracted_value = Token.special("NUMBER")
+ return self
+ end
+ end
+
+ class AbstractorPiece
+ def initialize(abstractor)
+ @abstractor = abstractor
+ end
+
+ def abstract(tokens)
+ return tokens
+ end
+
+ def deabstract(tokens)
+ return tokens
+ end
+ end
+
+ class IdentifierAbstractor < AbstractorPiece
+ def abstract(tokens)
+ identifier_tokens = tokens.select { |t| t.type == :identifier }
+ identifiers = identifier_tokens.map { |id| id.value }.uniq
+
+ identifiers.each do |id|
+ if @abstractor.identifiers_dictionary.include?(id)
+ abstracted_id = @abstractor.identifiers_dictionary.index(id)
+ else
+ abstracted_id = @abstractor.identifiers_dictionary.size
+ @abstractor.identifiers_dictionary << id
+ end
+
+ identifier_tokens.select { |t| t.value == id }.each do |matching_token|
+ matching_token.abstracted_value = Token.special("ID#{abstracted_id}")
  end
  end

- if @abstract_strings
- tokens.select { |t| t.type == :string }.each do |string_token|
- string_token.abstracted_value = Token.special("STRING")
- end
+ return tokens
+ end
+
+ def deabstract(tokens)
+ tokens.select { |t| t.abstracted_value.match?(/.ID[0-9]+./) }.each do |token|
+ id = token.abstracted_value.scan(/.ID([0-9]+)./).flatten[0].to_i
+
+ token.type = :identifier
+ token.value = @abstractor.identifiers_dictionary[id]
  end

- if @remove_newlines
- tokens.delete_if { |t| t.type == :newline }
+ return tokens
+ end
+ end
+
+ class NumberAbstractor < AbstractorPiece
+ def abstract(tokens)
+ tokens.select { |t| t.type == :number }.each do |number_token|
+ number_token.abstracted_value = Token.special("NUMBER")
+ @abstractor.numbers_dictionary << number_token.value
  end

- if @remove_spaces
- tokens.delete_if { |t| t.type == :space }
- elsif @abstract_spaces
- tokens.select { |t| t.type == :space }.each do |space_token|
- previous_index = tokens.index(space_token) - 1
- if previous_index < 0 || tokens[previous_index].type == :newline
- space_token.abstracted_value = Token.special("INDENTATION")
- else
- space_token.abstracted_value = Token.special("WHITESPACE")
- end
+ return tokens
+ end
+
+ def deabstract(tokens)
+ id = 0
+ tokens.select { |t| t.abstracted_value == Token.special("NUMBER") }.each do |token|
+ token.type = :number
+ token.value = @abstractor.numbers_dictionary[id]
+
+ id += 1
+ end
+
+ return tokens
+ end
+ end
+
+ class StringAbstractor < AbstractorPiece
+ def abstract(tokens)
+ tokens.select { |t| t.type == :string }.each do |string_token|
+ string_token.abstracted_value = Token.special("STRING")
+ @abstractor.strings_dictionary << string_token.value
+ end
+
+ return tokens
+ end
+
+ def deabstract(tokens)
+ id = 0
+ tokens.select { |t| t.abstracted_value == Token.special("STRING") }.each do |token|
+ token.type = :string
+ token.value = '"' + @abstractor.strings_dictionary[id] + '"'
+
+ id += 1
+ end
+
+ return tokens
+ end
+ end
+
+ class CommentAbstractor < AbstractorPiece
+ def abstract(tokens)
+ tokens.select { |t| t.type == :comment }.each do |comment_token|
+ comment_token.abstracted_value = Token.special("COMMENT")
+ end
+ return tokens
+ end
+
+ def deabstract(tokens)
+ tokens.select { |t| t.abstracted_value == Token.special("COMMENT") }.each do |token|
+ token.type = :comment
+ token.value = 'Unknown comment'
+ end
+
+ return tokens
+ end
+ end
+
+ class SpaceAbstractor < AbstractorPiece
+ def abstract(tokens)
+ tokens.select { |t| t.type == :space }.each do |space_token|
+ previous_index = tokens.index(space_token) - 1
+ if previous_index < 0 || tokens[previous_index].type == :newline
+ space_token.abstracted_value = Token.special("INDENTATION")
+ else
+ space_token.abstracted_value = Token.special("WHITESPACE")
  end
  end

- return self
+ return tokens
+ end
+
+ def deabstract(tokens)
+ tokens.select do |t|
+ t.abstracted_value == Token.special("INDENTATION") ||
+ t.abstracted_value == Token.special("WHITESPACE")
+ end.each do |token|
+ token.type = :space
+ token.value = ' '
+ end
+
+ return tokens
+ end
+ end
+
+ class SpaceRemover < AbstractorPiece
+ def abstract(tokens)
+ tokens.delete_if { |t| t.type == :space }
+ return tokens
+ end
+ end
+
+ class NewlineRemover < AbstractorPiece
+ def abstract(tokens)
+ tokens.delete_if { |t| t.type == :newline }
+ return tokens
+ end
+ end
+
+ class CommentRemover < AbstractorPiece
+ def abstract(tokens)
+ tokens.delete_if { |t| t.type == :comment }
+ return tokens
  end
  end
  end
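
For orientation, here is a minimal usage sketch of the reworked Abstractor API above (chained configuration, per-kind dictionaries, and the new deabstract!). The token values are invented for illustration and nothing here is taken from the gem's own documentation:

    require 'code-lexer'

    # Hand-built tokens, using only the Token constructor visible in this diff.
    tokens = [
      CodeLexer::Token.new(:identifier, "counter"),
      CodeLexer::Token.new(:operator, "="),
      CodeLexer::Token.new(:number, "42")
    ]

    abstractor = CodeLexer::Abstractor.new.abstract_identifiers.abstract_numbers
    abstractor.abstract!(tokens)        # each registered AbstractorPiece rewrites abstracted_value in turn

    abstractor.identifiers_dictionary   # => ["NOOP", "counter"]; the old #dictionary still works but warns
    abstractor.numbers_dictionary       # => ["42"]

    abstractor.deabstract!(tokens)      # pieces restore token values from the recorded dictionaries
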
lib/code-lexer/config.rb CHANGED
@@ -32,8 +32,7 @@ module CodeLexer

  parsed['lexer'].each do |name, regexs|
  regexs.each do |regex|
- p regex
- regex = Regexp.new("^" + regex)
+ regex = Regexp.new("^" + regex, Regexp::MULTILINE)
  @rules << [name.to_sym, regex]
  end
  end
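
The functional change here is that every "^"-anchored rule is now compiled with Regexp::MULTILINE (and the stray debug call "p regex" is gone). In Ruby, that flag makes "." match newline characters as well; a quick standalone check:

    # Without MULTILINE "." stops at the line break; with it the same pattern crosses it.
    Regexp.new("^a.b").match?("a\nb")                     # => false
    Regexp.new("^a.b", Regexp::MULTILINE).match?("a\nb")  # => true
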
lib/code-lexer/languages/java.yml ADDED
@@ -0,0 +1,38 @@
+ lexer:
+ keyword:
+ - (?:abstract|arguments|boolean|break|byte|case|catch|char|const|continue|debugger|default|delete|double|do|else|eval|false|finally|final|float|for|function|goto|if|implements|int|in|instanceof|interface|let|long|native|new|null|package|private|protected|public|return|short|static|switch|synchronized|this|throws|throw|transient|true|try|typeof|var|void|volatile|while|with|yield|class|enum|export|extends|import|super|from|strictfp)
+ identifier:
+ - "[$A-Za-z_][$A-Za-z0-9_]*"
+ comment:
+ - \/\/[^\n\r]*(?=[\n\r])
+ - \/\/.*$
+ - \/\*([^*]|[\r\n]|(\*+([^*/]|[\r\n])))*\*+\/
+ string:
+ - \"([^"]|\\\")*\"
+ - \'[^']*\'
+ regex:
+ - \/([^/]|\\\/)*\/[gim]*
+ number:
+ - \-?[0-9]*\.[0-9]e\-?[0-9]+
+ - \-?[0-9]*\.[0-9]
+ - \-?[1-9][0-9]*
+ - \-?0[Xx][0-9A-Fa-f]+
+ - \-?[0-9]
+ - \-?0[0-7]+
+ operator:
+ - (\=\=\=|\!\=\=)
+ - (\<\=|\>\=|\=\=|\!\=|\=\>)
+ - (\&\&|\|\||\!)
+ - (\+\=|\-\=|\/\=|\*\=|\%\=|\+\+|\-\-)
+ - (\&|\||\~|\^|\<\<|\>\>)
+ - (\=|\+|\-|\/|\*|\%)
+ - (\.|\,|\:)
+ - (\<|\>|\?)
+ parenthesis:
+ - (\(|\)|\[|\]|\{|\})
+ semicolon:
+ - \;
+ newline:
+ - "[\\n\\r]"
+ space:
+ - \s+
lib/code-lexer/languages/javascript.yml CHANGED
@@ -1,15 +1,17 @@
  lexer:
  keyword:
- - (?:abstract|arguments|boolean|break|byte|case|catch|char|const|continue|debugger|default|delete|do|double|else|eval|false|final|finally|float|for|function|goto|if|implements|in|instanceof|int|interface|let|long|native|new|null|package|private|protected|public|return|short|static|switch|synchronized|this|throw|throws|transient|true|try|typeof|var|void|volatile|while|with|yield|class|enum|export|extends|import|super|from)
+ - (?:abstract|arguments|boolean|break|byte|case|catch|char|const|continue|debugger|default|delete|double|do|else|eval|false|finally|final|float|for|function|goto|if|implements|int|in|instanceof|interface|let|long|native|new|null|package|private|protected|public|return|short|static|switch|synchronized|this|throws|throw|transient|true|try|typeof|var|void|volatile|while|with|yield|class|enum|export|extends|import|super|from)
  identifier:
  - "[$A-Za-z_][$A-Za-z0-9_]*"
  comment:
- - \/\/[^\n\r]*[\n\r]
+ - \/\/[^\n\r]*(?=[\n\r])
  - \/\/.*$
  - \/\*([^*]|[\r\n]|(\*+([^*/]|[\r\n])))*\*+\/
  string:
  - \"([^"]|\\\")*\"
  - \'[^']*\'
+ regex:
+ - \/([^/]|\\\/)*\/[gim]*
  number:
  - \-?[0-9]*\.[0-9]e\-?[0-9]+
  - \-?[0-9]*\.[0-9]
@@ -19,13 +21,13 @@ lexer:
  - \-?0[0-7]+
  operator:
  - (\=\=\=|\!\=\=)
- - (\<\=|\>\=|\=\=|\!\=)
+ - (\<\=|\>\=|\=\=|\!\=|\=\>)
  - (\&\&|\|\||\!)
  - (\+\=|\-\=|\/\=|\*\=|\%\=|\+\+|\-\-)
  - (\&|\||\~|\^|\<\<|\>\>)
  - (\=|\+|\-|\/|\*|\%)
  - (\.|\,|\:)
- - (\<|\>)
+ - (\<|\>|\?)
  parenthesis:
  - (\(|\)|\[|\]|\{|\})
  semicolon:
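
Both language files now use a lookahead for the single-line comment rule: the line break is no longer consumed as part of the comment match, so it stays in the input for the newline rule. A standalone comparison of the two patterns:

    old_rule = Regexp.new("^" + '\/\/[^\n\r]*[\n\r]')
    new_rule = Regexp.new("^" + '\/\/[^\n\r]*(?=[\n\r])')

    old_rule.match("// hi\nnext")[0]   # => "// hi\n"  (newline swallowed by the comment token)
    new_rule.match("// hi\nnext")[0]   # => "// hi"    (newline left for the newline rule)
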
lib/code-lexer/lexer.rb CHANGED
@@ -12,7 +12,7 @@ module CodeLexer
  end
  end

- def lex(content)
+ def lex(content, abstractor = nil)
  content = content.clone
  tokens = []
  while content.length > 0
@@ -23,17 +23,31 @@ module CodeLexer
  end
  end

- return LexedContent.new(tokens)
+ return LexedContent.new(tokens, abstractor)
  end
  end

  class LexedContent
  attr_reader :tokens
+ attr_reader :abstractor

- def initialize(tokens)
+ def self.from_stream_string(stream, abstractor)
+ tokens = stream.split(" ").map { |t| Token.from_string(t) }
+ abstractor.deabstract!(tokens)
+ return LexedContent.new(tokens, abstractor)
+ end
+
+ def initialize(tokens, abstractor = nil)
  @tokens = tokens
+ @abstractor = abstractor
+
+ @abstractor.abstract!(@tokens) if @abstractor
+ end
+
+ def reconstruct
+ @tokens.map { |t| t.value.to_s }.join("")
  end
-
+
  def token_lines
  result = []
  current_line = []
@@ -53,14 +67,73 @@ module CodeLexer
  end

  def token_stream(abstractor = nil)
- abstractor.abstract!(@tokens) if abstractor
-
  result = []
- @tokens.each do |token|
+
+ tokens = @tokens
+ if abstractor
+ tokens = tokens.map { |t| t.clone }
+ tokens.each { |t| t.reset_abstraction }
+ abstractor.abstract!(tokens)
+ end
+
+ tokens.each do |token|
  result << token.abstracted_value
  end

  return result.join(" ")
  end
+
+ def to_s
+ @tokens.map { |t| t.value }.join("")
+ end
+
+ def dump(filename, mode = "w", force = false)
+ if mode.downcase.include?("w") && !force
+ if FileTest.exist?(filename) || FileTest.exist?(lexdata(filename))
+ raise "Destination filename or lexdata filename already exist."
+ end
+ end
+
+ File.open(filename, mode) do |f|
+ f << self.token_stream + "\n"
+ end
+
+ File.open(lexdata(filename), "#{mode}b") do |f|
+ f << Marshal.dump(@abstractor)
+ end
+ end
+
+ def self.load(file_or_filename, lexdata_or_lexdata_filename = nil)
+ if file_or_filename.is_a?(String) && (lexdata_or_lexdata_filename.is_a?(String) || !lexdata_or_lexdata_filename)
+ unless lexdata_or_lexdata_filename
+ return self.load_filename(file_or_filename)
+ else
+ return self.load_filename(file_or_filename, lexdata_or_lexdata_filename)
+ end
+ elsif file_or_filename.is_a?(File) && lexdata_or_lexdata_filename.is_a?(File)
+ return self.load_file(file_or_filename, lexdata_or_lexdata_filename)
+ else
+ raise "Unable to call with the provided input types: expected (String, String), (String), or (File, File)"
+ end
+ end
+
+ def self.load_filename(filename, lexdata_filename = filename + ".lexdata")
+ File.open(filename, "r") do |file|
+ File.open(lexdata_filename, "rb") do |lexdata_file|
+ return LexedContent.load_file(file, lexdata_file)
+ end
+ end
+ end
+
+ def self.load_file(file, lexdata_file)
+ line = file.readline
+ abstractor = Marshal.load(lexdata_file)
+ return LexedContent.from_stream_string(line, abstractor)
+ end
+
+ private
+ def lexdata(filename)
+ filename + ".lexdata"
+ end
  end
  end
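
Taken together, the LexedContent changes let an abstractor travel with the lexed tokens and be serialized next to them. A rough sketch of the new dump/load round trip, reusing hand-built tokens as above; the filename is made up, and dump refuses to overwrite existing files unless force is passed:

    require 'code-lexer'

    abstractor = CodeLexer::Abstractor.new.abstract_identifiers.abstract_numbers
    tokens = [
      CodeLexer::Token.new(:identifier, "total"),
      CodeLexer::Token.new(:operator, "="),
      CodeLexer::Token.new(:number, "3")
    ]

    # Passing the abstractor triggers abstract! inside LexedContent#initialize.
    lexed = CodeLexer::LexedContent.new(tokens, abstractor)
    lexed.token_stream                  # space-separated abstracted values

    lexed.dump("example.lex")           # also writes "example.lex.lexdata" (Marshal of the abstractor)

    restored = CodeLexer::LexedContent.load("example.lex")
    restored.tokens                     # rebuilt via Token.from_string and Abstractor#deabstract!
    restored.reconstruct                # joins the restored token values back into text
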
lib/code-lexer/token.rb CHANGED
@@ -11,6 +11,19 @@ module CodeLexer
  attr_accessor :value
  attr_accessor :abstracted_value

+ def self.from_string(string)
+ unless string.start_with?(SPECIAL_TOKEN_OPEN)
+ value = string
+ else
+ value = nil
+ end
+
+ token = Token.new(:unknown, value)
+ token.abstracted_value = string
+
+ return token
+ end
+
  def initialize(type, value)
  @type = type
  self.value = value
@@ -18,13 +31,7 @@ module CodeLexer

  def value=(v)
  @value = v
- if @type == :newline
- @abstracted_value = Token.special("NEWLINE")
- elsif v =~ /\s/
- @abstracted_value = Token.special(v.gsub(/\s/, "·"))
- else
- @abstracted_value = v
- end
+ self.reset_abstraction
  end

  def to_s
@@ -38,5 +45,15 @@ module CodeLexer
  def ==(oth)
  @type == oth.type && @value == oth.value && @abstracted_value == oth.abstracted_value
  end
+
+ def reset_abstraction
+ if @type == :newline
+ @abstracted_value = Token.special("NEWLINE")
+ elsif @value =~ /\s/
+ @abstracted_value = Token.special(@value.gsub(/\s/, "·"))
+ else
+ @abstracted_value = @value.clone
+ end
+ end
  end
  end
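
The Token changes split the old value= logic into a reusable reset_abstraction and add a constructor for entries read back from an abstracted stream. A small sketch, with "foo" and "ID0" as arbitrary example values:

    t = CodeLexer::Token.new(:identifier, "foo")
    t.abstracted_value                                   # defaults to a clone of the value
    t.abstracted_value = CodeLexer::Token.special("ID0")
    t.reset_abstraction                                  # back to the value-based default

    # Tokens rebuilt from a stream start as :unknown; special entries carry no
    # concrete value until Abstractor#deabstract! fills it in.
    CodeLexer::Token.from_string(CodeLexer::Token.special("NUMBER"))
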
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: code-lexer
  version: !ruby/object:Gem::Version
- version: '0.6'
+ version: '0.8'
  platform: ruby
  authors:
  - Simone Scalabrino
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2021-12-21 00:00:00.000000000 Z
+ date: 2022-11-09 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: code-assertions
@@ -40,6 +40,7 @@ files:
  - lib/code-lexer.rb
  - lib/code-lexer/abstractor.rb
  - lib/code-lexer/config.rb
+ - lib/code-lexer/languages/java.yml
  - lib/code-lexer/languages/javascript.yml
  - lib/code-lexer/lexer.rb
  - lib/code-lexer/token.rb
@@ -62,7 +63,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  - !ruby/object:Gem::Version
  version: '0'
  requirements: []
- rubygems_version: 3.2.32
+ rubygems_version: 3.3.7
  signing_key:
  specification_version: 4
  summary: Simple source code lexer