lex 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +3 -0
- data/.ruby-version +1 -0
- data/.travis.yml +22 -0
- data/Gemfile +19 -0
- data/LICENSE.txt +22 -0
- data/README.md +423 -0
- data/Rakefile +8 -0
- data/lex.gemspec +22 -0
- data/lib/lex.rb +22 -0
- data/lib/lex/lexeme.rb +27 -0
- data/lib/lex/lexer.rb +210 -0
- data/lib/lex/lexer/dsl.rb +49 -0
- data/lib/lex/lexer/rule_dsl.rb +165 -0
- data/lib/lex/lexers.rb +11 -0
- data/lib/lex/lexers/html.rb +8 -0
- data/lib/lex/linter.rb +114 -0
- data/lib/lex/logger.rb +21 -0
- data/lib/lex/source_line.rb +13 -0
- data/lib/lex/state.rb +37 -0
- data/lib/lex/token.rb +47 -0
- data/lib/lex/version.rb +5 -0
- data/spec/spec_helper.rb +50 -0
- data/spec/unit/error_spec.rb +42 -0
- data/spec/unit/keyword_spec.rb +34 -0
- data/spec/unit/lex_spec.rb +60 -0
- data/spec/unit/position_spec.rb +94 -0
- data/spec/unit/rule_spec.rb +63 -0
- data/spec/unit/state/clone_spec.rb +15 -0
- data/spec/unit/states_spec.rb +194 -0
- data/spec/unit/tokens_spec.rb +32 -0
- data/tasks/console.rake +10 -0
- data/tasks/coverage.rake +11 -0
- data/tasks/spec.rake +29 -0
- metadata +104 -0
data/Rakefile
ADDED
data/lex.gemspec
ADDED
# coding: utf-8
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'lex/version'

Gem::Specification.new do |spec|
  spec.name          = "lex"
  spec.version       = Lex::VERSION
  spec.authors       = ["Piotr Murach"]
  spec.email         = [""]
  # Fixed typos in user-facing text: "complier constuction" -> "compiler
  # construction" and "expressivness or Ruby" -> "expressiveness of Ruby".
  spec.summary       = %q{Lex is an implementation of compiler construction tool lex in Ruby.}
  spec.description   = %q{Lex is an implementation of compiler construction tool lex in Ruby. The goal is to stay close to the way the original tool works and combine it with the expressiveness of Ruby.}
  spec.homepage      = ""
  spec.license       = "MIT"

  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.6"
end
|
data/lib/lex.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
require 'strscan'
|
4
|
+
require 'logger'
|
5
|
+
require 'forwardable'
|
6
|
+
|
7
|
+
require 'lex/logger'
|
8
|
+
require 'lex/linter'
|
9
|
+
require 'lex/lexeme'
|
10
|
+
require 'lex/source_line'
|
11
|
+
require 'lex/state'
|
12
|
+
require 'lex/token'
|
13
|
+
require 'lex/lexer'
|
14
|
+
require "lex/version"
|
15
|
+
|
16
|
+
module Lex
  # Root of the lexer error hierarchy; rescue this to catch
  # any error raised by the library.
  class Error < StandardError
  end

  # Raised when the lexer cannot tokenize its input.
  class LexerError < Error
  end
end # Lex
|
data/lib/lex/lexeme.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
module Lex
  # A single token definition: a token name, the pattern that
  # recognizes it, and an optional action to run on a match.
  class Lexeme
    attr_reader :name, :pattern, :action

    # @param name [Symbol] the token name
    # @param pattern [Regexp] pattern matched against the input
    # @param action [Proc] optional block invoked for this lexeme
    def initialize(name, pattern, &action)
      @name    = name
      @pattern = pattern
      @action  = action
    end

    # Try a non-consuming match at the scanner's current position.
    #
    # @param scanner [StringScanner]
    # @return [Token, nil] a fresh token when the pattern matches,
    #   nil otherwise
    def match(scanner)
      matched = scanner.check(pattern)
      Token.new(name, matched.to_s, &action) if matched
    end

    # Two lexemes are equal when their names coincide; pattern and
    # action are not compared.
    #
    # @api public
    def ==(other)
      @name == other.name
    end
  end # Lexeme
end # Lex
|
data/lib/lex/lexer.rb
ADDED
@@ -0,0 +1,210 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
require 'lex/lexer/dsl'
|
4
|
+
|
5
|
+
module Lex
  # An abstract lexer that doesn't provide any lexing rules.
  # Concrete lexers subclass it and declare tokens/states/rules
  # through the included DSL; configuration lives in a RuleDSL
  # instance shared per subclass (see Lexer::DSL).
  #
  # @api public
  class Lexer
    extend Forwardable
    include DSL

    attr_reader :input,
                :logger,
                :debug,
                :current_state,
                :current_line

    # Expose the rule DSL's configuration tables directly on the lexer.
    def_delegators :@dsl,
                   :lex_tokens,
                   :state_info,
                   :state_re,
                   :state_names,
                   :state_ignore,
                   :state_error,
                   :state_lexemes

    # @param options [Hash]
    # @option options [Boolean] :debug log lexing progress when truthy
    # @yield optional block evaluated against the rule DSL
    def initialize(options = {}, &block)
      @current_line = 1
      @current_pos = 1 # Position in input
      @char_pos_in_line = 0
      @current_state = :initial
      @state_stack = []
      @logger = Lex::Logger.new
      @linter = Lex::Linter.new
      @debug = options[:debug]
      @dsl = self.class.dsl

      @dsl.instance_eval(&block) if block
      # Validate the configured rules before any lexing happens.
      @linter.lint(self)
    end

    # Tokenizes input and returns all tokens
    #
    # @param [String] input
    #
    # @return [Enumerator]
    #   the tokens found
    #
    # @api public
    def lex(input)
      @input = input

      # Without a block, hand back a lazy enumerator over the tokens.
      return enum_for(:lex, input) unless block_given?

      if debug
        logger.info "lex: tokens = #{@dsl.lex_tokens}"
        logger.info "lex: states = #{@dsl.state_info}"
        logger.info "lex: ignore = #{@dsl.state_ignore}"
        logger.info "lex: error = #{@dsl.state_error}"
      end

      stream_tokens(input) do |token|
        yield token
      end
    end

    # Advances through input and streams tokens
    #
    # Longest-match-wins: every lexeme of the current state is tried
    # at the scanner position and the longest value is kept.
    #
    # @param [String] input
    #
    # @yield [Lex::Token]
    #
    # @api public
    def stream_tokens(input, &block)
      scanner = StringScanner.new(input)
      while !scanner.eos?
        current_char = scanner.peek(1)
        # Skip characters ignored in the current state.
        if @dsl.state_ignore[current_state].include?(current_char)
          scanner.pos += current_char.size
          @char_pos_in_line += current_char.size
          next
        end

        if debug
          logger.info "lex: [#{current_state}]: lexemes = #{@dsl.state_lexemes[current_state].map(&:name)}"
        end
        # Look for regex match
        longest_token = nil
        @dsl.state_lexemes[current_state].each do |lexeme|
          match = lexeme.match(scanner)
          next if match.nil?
          longest_token = match if longest_token.nil?
          # Keep the current candidate when it is at least as long.
          next if longest_token.value.length >= match.value.length
          longest_token = match
        end

        if longest_token
          if longest_token.action
            new_token = longest_token.action.call(self, longest_token)
            # No value returned from action move to the next token
            if new_token.nil? || !new_token.is_a?(Token)
              chars_to_skip = longest_token.value.to_s.length
              scanner.pos += chars_to_skip
              # Newlines reset the column via advance_line in actions,
              # so the column counter is not advanced for them here.
              unless longest_token.name == :newline
                @char_pos_in_line += chars_to_skip
              end
              next
            end
          end
          move_by = longest_token.value.to_s.length
          start_char_pos_in_token = @char_pos_in_line + current_char.size
          longest_token.update_line(current_line, start_char_pos_in_token)
          advance_column(move_by)
          scanner.pos += move_by
        end

        # No match
        if longest_token.nil?
          # Check in errors
          if @dsl.state_error[current_state]
            token = Token.new(:error, current_char)
            start_char_pos_in_token = @char_pos_in_line + current_char.size
            token.update_line(current_line, start_char_pos_in_token)
            new_token = @dsl.state_error[current_state].call(self, token)
            advance_column(current_char.length)
            scanner.pos += current_char.length
            # NOTE(review): the action branch above skips any non-Token
            # return, while this accepts any non-nil value as a token —
            # confirm whether `new_token.is_a?(Token)` alone was intended.
            if new_token.is_a?(Token) || !new_token.nil?
              longest_token = new_token
            else
              next
            end
          end

          if longest_token.nil?
            complain("Illegal character `#{current_char}`")
          end
        end

        logger.info "lex: #{longest_token}" if debug
        block.call(longest_token)
      end
    end

    # Switches the state
    #
    # @param [Symbol] state
    #   the name of the state
    #
    # @raise [LexerError] when the state was never declared
    #
    # @api public
    def begin(state)
      unless @dsl.state_info.key?(state)
        complain("Undefined state: #{state}")
      end
      @current_state = state
    end

    # Enter new state and save old one on stack
    #
    # @param [Symbol] state
    #   the name of the state
    #
    # @api public
    def push_state(state)
      @state_stack << @current_state
      self.begin(state)
    end

    # Restore previous state
    #
    # @param [Symbol] state
    #   the name of the state
    #
    # @api public
    def pop_state
      self.begin(@state_stack.pop)
    end

    # Skip ahead n characters
    #
    # @api public
    def skip(n)
      @current_pos += n
    end

    # Move the line counter forward and reset the column position.
    #
    # @param [Integer] value
    #   number of lines to advance by
    def advance_line(value)
      @current_line += value
      @char_pos_in_line = 0
    end

    # Move the column position forward within the current line.
    #
    # @param [Integer] value
    #   number of characters to advance by
    def advance_column(value)
      @char_pos_in_line += value
    end

    # Reset the internal state of the lexer
    # @api public
    def rewind
      # NOTE(review): these ivars (@line, @column, @stack) are not read
      # anywhere else in this class — the tracking ivars are
      # @current_line, @char_pos_in_line and @state_stack. Confirm
      # whether rewind should reset those instead.
      @line = 1
      @column = 1
      @stack = []
    end

    private

    # Raise a LexerError with the given message.
    #
    # @api private
    def complain(*args)
      raise LexerError, *args
    end
  end # Lexer
end # Lex
|
@@ -0,0 +1,49 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
require 'lex/lexer/rule_dsl'
|
4
|
+
|
5
|
+
module Lex
  class Lexer
    # Lexer DSL
    #
    # Included into Lexer; class-level calls that the lexer class does
    # not define itself (tokens, states, rule, ...) are forwarded to a
    # per-class RuleDSL instance.
    module DSL
      # Extend lexer class with DSL methods
      #
      # @api private
      def self.included(klass)
        klass.extend(ClassMethods)
      end

      # Class methods for a lexer
      #
      # @api private
      module ClassMethods
        # Reset dsl so each subclass builds its own rule set
        #
        # @api private
        def inherited(klass)
          super

          klass.instance_variable_set('@dsl', nil)
        end

        # Return the rule DSL used by Lexer
        #
        # @api private
        def dsl
          @dsl ||= RuleDSL.new
        end

        # Delegate calls to RuleDSL
        #
        # @api private
        def method_missing(name, *args, &block)
          if dsl.respond_to?(name)
            dsl.public_send(name, *args, &block)
          else
            super
          end
        end

        # Keep respond_to? truthful about the delegation done in
        # method_missing (fixes the standard method_missing without
        # respond_to_missing? anti-pattern).
        #
        # @api private
        def respond_to_missing?(name, include_private = false)
          dsl.respond_to?(name) || super
        end
      end # ClassMethods
    end # DSL
  end # Lexer
end # Lex
|
@@ -0,0 +1,165 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
module Lex
  class Lexer
    # Rules DSL used internally by {Lexer}
    #
    # Accumulates token names, state definitions, lexing rules and
    # per-state ignore/error handlers as the lexer class body runs.
    #
    # @api private
    class RuleDSL
      attr_reader :lex_tokens,
                  :state_info,
                  :state_re,
                  :state_names,
                  :state_ignore,
                  :state_error,
                  :state_lexemes

      # @api private
      def initialize
        @state_info = { initial: :inclusive }
        @state_ignore = { initial: '' } # Ignored characters for each state
        @state_error = {} # Error conditions for each state
        @state_re = Hash.new { |hash, name| hash[name] = {}} # Regexes for each state
        @state_names = {} # Symbol names for each state
        @state_lexemes = Hash.new { |hash, name| hash[name] = State.new(name) }
        @lex_tokens = [] # List of valid tokens
      end

      # Add tokens to lexer
      #
      # @param [Array<Symbol>] value
      #   the valid token names
      #
      # @api public
      def tokens(*value)
        @lex_tokens = value
      end

      # Add states to lexer
      #
      # @param [Hash{Symbol => Symbol}] value
      #   state name mapped to :inclusive or :exclusive
      #
      # @api public
      def states(value)
        @state_info.merge!(value)
      end

      # Specify lexing rule
      #
      # @param [Symbol] name
      #   the rule name
      #
      # @param [Regex] pattern
      #   the regex pattern
      #
      # @api public
      def rule(name, pattern, &action)
        state_names, token_name = *extract_state_token(name)
        # All-uppercase names denote plain tokens and must have been
        # declared via #tokens first.
        if token_name =~ /^[[:upper:]]*$/ && !@lex_tokens.include?(token_name)
          complain("Rule '#{name}' defined for" \
                   " an unspecified token #{token_name}")
        end
        state_names.each do |state_name|
          state = @state_lexemes[state_name]
          state << Lexeme.new(token_name, pattern, &action)
        end
        update_inclusive_states
        state_names.each do |state_name|
          if @state_re[state_name].key?(token_name)
            complain("Rule '#{name}' redefined.")
          end
          @state_re[state_name][token_name] = pattern
        end
      end

      # Define ignore condition for a state
      #
      # @param [Symbol] states
      #   the optional state names
      #
      # @param [String] value
      #   the characters to ignore
      #
      # @api public
      def ignore(states, value = (not_set = true))
        # Single-argument form: the argument is the ignore string and
        # applies to the :initial state.
        if not_set
          value = states
          state_names = [:initial]
        else
          # NOTE(review): state names are split on '_', so a state whose
          # own name contains an underscore cannot be addressed here —
          # confirm this matches the DSL's naming convention.
          state_names = states.to_s.split('_').map(&:to_sym)
        end
        if !value.is_a?(String)
          # NOTE(review): RuleDSL defines no #logger method, so this
          # call raises NoMethodError instead of logging — likely should
          # use complain or a Lex::Logger instance; confirm intent.
          logger.error("Ignore rule '#{value}' has to be defined with a string")
        end
        state_names.each do |state_name|
          @state_ignore[state_name] = value
        end
        # Inclusive states inherit the initial state's ignore set
        # unless they declared their own.
        @state_info.each do |state_name, state_type|
          if state_name != :initial && state_type == :inclusive
            if !@state_ignore.key?(state_name)
              @state_ignore[state_name] = @state_ignore[:initial]
            end
          end
        end
      end

      # Define error condition for a state
      #
      # @api public
      def error(states = :initial, &action)
        state_names = states.to_s.split('_').map(&:to_sym)
        state_names.each do |state_name|
          @state_error[state_name] = action
        end
        # Inclusive states fall back to the initial state's error
        # handler unless they declared their own.
        @state_info.each do |state_name, state_type|
          if state_name != :initial && state_type == :inclusive
            if !@state_error.key?(state_name)
              @state_error[state_name] = @state_error[:initial]
            end
          end
        end
      end

      private

      # For inclusive states copy over initial state rules
      #
      # @api private
      def update_inclusive_states
        @state_info.each do |state_name, state_type|
          if state_name != :initial && state_type == :inclusive
            initial_state = @state_lexemes[:initial]
            @state_lexemes[state_name].update(initial_state.lexemes)
          end
        end
      end

      # Extract tuple of state names and token name
      #
      # A rule name like :html_comment_TAG is parsed as leading known
      # state names (:html, :comment) followed by the token name (:TAG).
      #
      # @param [Symbol] name
      #   the rule name
      #
      # @return [Array[Symbol], Symbol]
      #   returns tuples [states, token]
      #
      # @api private
      def extract_state_token(name)
        parts = name.to_s.split('_')
        state_i = 0
        # Find the first part that is not a declared state name; that
        # is where the token name begins.
        parts.each_with_index do |part, i|
          if !@state_info.keys.include?(part.to_sym)
            state_i = i
            break
          end
        end
        states = if state_i > 0
                   parts[0...state_i].map(&:to_sym)
                 else
                   [:initial]
                 end
        token_name = parts[state_i..-1].join('_').to_sym
        [states, token_name]
      end

      # Raise a LexerError with the given message.
      #
      # @api private
      def complain(*args)
        raise LexerError, *args
      end
    end # RuleDSL
  end # Lexer
end # Lex
|