lex 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
# coding: utf-8

require 'bundler/gem_tasks'

# Pull in every project-specific rake task from the tasks/ directory.
FileList['tasks/**/*.rake'].each { |task_file| import(task_file) }

desc 'Run all specs'
task ci: %w[ spec ]
# coding: utf-8
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'lex/version'

Gem::Specification.new do |spec|
  spec.name          = "lex"
  spec.version       = Lex::VERSION
  spec.authors       = ["Piotr Murach"]
  spec.email         = [""]
  # Fixed typos in the published summary/description:
  # "complier constuction" -> "compiler construction",
  # "expressivness or Ruby" -> "expressiveness of Ruby".
  spec.summary       = %q{Lex is an implementation of compiler construction tool lex in Ruby.}
  spec.description   = %q{Lex is an implementation of compiler construction tool lex in Ruby. The goal is to stay close to the way the original tool works and combine it with the expressiveness of Ruby.}
  spec.homepage      = ""
  spec.license       = "MIT"

  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.6"
end
# coding: utf-8

require 'strscan'
require 'logger'
require 'forwardable'

require 'lex/logger'
require 'lex/linter'
require 'lex/lexeme'
require 'lex/source_line'
require 'lex/state'
require 'lex/token'
require 'lex/lexer'
# Use single quotes consistently with the other requires in this file.
require 'lex/version'

module Lex
  # A base class for all Lexer errors
  class Error < StandardError; end

  # Raised when lexing fails
  class LexerError < Error; end
end # Lex
# coding: utf-8

module Lex
  # Represents a single token definition: a name, a regex pattern and
  # an optional action invoked when the pattern matches.
  class Lexeme
    attr_reader :name, :pattern, :action

    def initialize(name, pattern, &action)
      @name    = name
      @pattern = pattern
      @action  = action
    end

    # Try this lexeme's pattern at the scanner's current position
    # without advancing the scan pointer.
    #
    # @param [StringScanner] scanner
    #
    # @return [Token, nil]
    #   a fresh token when the pattern matches, nil otherwise
    def match(scanner)
      matched = scanner.check(pattern)
      return nil unless matched
      Token.new(name, matched.to_s, &action)
    end

    # Lexemes are considered equal when they share the same name.
    #
    # @api public
    def ==(other)
      @name == other.name
    end
  end # Lexeme
end # Lex
# coding: utf-8

require 'lex/lexer/dsl'

module Lex
  # An abstract lexer that doesn't provide any lexing rules.
  #
  # Concrete lexers subclass this class and declare tokens, states and
  # rules through the {DSL} mixin.
  #
  # @api public
  class Lexer
    extend Forwardable
    include DSL

    attr_reader :input,
                :logger,
                :debug,
                :current_state,
                :current_line

    def_delegators :@dsl,
                   :lex_tokens,
                   :state_info,
                   :state_re,
                   :state_names,
                   :state_ignore,
                   :state_error,
                   :state_lexemes

    # @param [Hash] options
    # @option options [Boolean] :debug
    #   when truthy, log details of the lexing run
    #
    # @api public
    def initialize(options = {}, &block)
      @current_line = 1
      @current_pos = 1 # Position in input
      @char_pos_in_line = 0
      @current_state = :initial
      @state_stack = []
      @logger = Lex::Logger.new
      @linter = Lex::Linter.new
      @debug = options[:debug]
      @dsl = self.class.dsl

      # Allow rules to be declared inline at instantiation time
      @dsl.instance_eval(&block) if block
      @linter.lint(self)
    end

    # Tokenizes input and returns all tokens
    #
    # @param [String] input
    #
    # @return [Enumerator]
    #   the tokens found
    #
    # @api public
    def lex(input)
      @input = input

      return enum_for(:lex, input) unless block_given?

      if debug
        logger.info "lex: tokens = #{@dsl.lex_tokens}"
        logger.info "lex: states = #{@dsl.state_info}"
        logger.info "lex: ignore = #{@dsl.state_ignore}"
        logger.info "lex: error = #{@dsl.state_error}"
      end

      stream_tokens(input) do |token|
        yield token
      end
    end

    # Advances through input and streams tokens
    #
    # @param [String] input
    #
    # @yield [Lex::Token]
    #
    # @api public
    def stream_tokens(input, &block)
      scanner = StringScanner.new(input)
      while !scanner.eos?
        current_char = scanner.peek(1)
        # Skip characters ignored in the current state
        if @dsl.state_ignore[current_state].include?(current_char)
          scanner.pos += current_char.size
          @char_pos_in_line += current_char.size
          next
        end

        if debug
          logger.info "lex: [#{current_state}]: lexemes = #{@dsl.state_lexemes[current_state].map(&:name)}"
        end
        # Look for the longest regex match among this state's lexemes;
        # on equal lengths the earlier-defined lexeme wins.
        longest_token = nil
        @dsl.state_lexemes[current_state].each do |lexeme|
          match = lexeme.match(scanner)
          next if match.nil?
          longest_token = match if longest_token.nil?
          next if longest_token.value.length >= match.value.length
          longest_token = match
        end

        if longest_token
          if longest_token.action
            new_token = longest_token.action.call(self, longest_token)
            # No value returned from action move to the next token
            if new_token.nil? || !new_token.is_a?(Token)
              chars_to_skip = longest_token.value.to_s.length
              scanner.pos += chars_to_skip
              # Newline actions are expected to advance the line themselves
              unless longest_token.name == :newline
                @char_pos_in_line += chars_to_skip
              end
              next
            end
          end
          move_by = longest_token.value.to_s.length
          start_char_pos_in_token = @char_pos_in_line + current_char.size
          longest_token.update_line(current_line, start_char_pos_in_token)
          advance_column(move_by)
          scanner.pos += move_by
        end

        # No match
        if longest_token.nil?
          # Check in errors
          if @dsl.state_error[current_state]
            token = Token.new(:error, current_char)
            start_char_pos_in_token = @char_pos_in_line + current_char.size
            token.update_line(current_line, start_char_pos_in_token)
            new_token = @dsl.state_error[current_state].call(self, token)
            advance_column(current_char.length)
            scanner.pos += current_char.length
            # An error handler may return a replacement token
            if new_token.is_a?(Token) || !new_token.nil?
              longest_token = new_token
            else
              next
            end
          end

          if longest_token.nil?
            complain("Illegal character `#{current_char}`")
          end
        end

        logger.info "lex: #{longest_token}" if debug
        block.call(longest_token)
      end
    end

    # Switches the state
    #
    # @param [Symbol] state
    #   the name of the state
    #
    # @api public
    def begin(state)
      unless @dsl.state_info.key?(state)
        complain("Undefined state: #{state}")
      end
      @current_state = state
    end

    # Enter new state and save old one on stack
    #
    # @param [Symbol] state
    #   the name of the state
    #
    # @api public
    def push_state(state)
      @state_stack << @current_state
      self.begin(state)
    end

    # Restore previous state
    #
    # @api public
    def pop_state
      self.begin(@state_stack.pop)
    end

    # Skip ahead n characters
    #
    # @param [Integer] n
    #
    # @api public
    def skip(n)
      @current_pos += n
    end

    # Advance the line counter and reset the column position
    #
    # @api public
    def advance_line(value)
      @current_line += value
      @char_pos_in_line = 0
    end

    # Advance the column position within the current line
    #
    # @api public
    def advance_column(value)
      @char_pos_in_line += value
    end

    # Reset the internal state of the lexer
    #
    # Restores the same defaults as {#initialize} so the lexer can be
    # reused on fresh input. (Previously this assigned @line, @column
    # and @stack, which no other method reads, making rewind a no-op.)
    #
    # @api public
    def rewind
      @current_line = 1
      @current_pos = 1
      @char_pos_in_line = 0
      @current_state = :initial
      @state_stack = []
    end

    private

    # Raise a LexerError with the given message
    #
    # @api private
    def complain(*args)
      raise LexerError, *args
    end
  end # Lexer
end # Lex
# coding: utf-8

require 'lex/lexer/rule_dsl'

module Lex
  class Lexer
    # Lexer DSL
    module DSL
      # Extend lexer class with DSL methods
      #
      # @api private
      def self.included(klass)
        klass.extend(ClassMethods)
      end

      # Class methods for a lexer
      #
      # @api private
      module ClassMethods
        # Reset dsl for each subclass so lexers don't share rule state
        #
        # @api private
        def inherited(klass)
          super

          klass.instance_variable_set('@dsl', nil)
        end

        # Return the rule DSL used by Lexer
        #
        # @api private
        def dsl
          @dsl ||= RuleDSL.new
        end

        # Delegate calls to RuleDSL
        #
        # @api private
        def method_missing(name, *args, &block)
          if dsl.respond_to?(name)
            dsl.public_send(name, *args, &block)
          else
            super
          end
        end

        # Keep respond_to? in sync with the delegation performed by
        # method_missing (required whenever method_missing is defined)
        #
        # @api private
        def respond_to_missing?(name, include_private = false)
          dsl.respond_to?(name) || super
        end
      end # ClassMethods
    end # DSL
  end # Lexer
end # Lex
# coding: utf-8

module Lex
  class Lexer
    # Rules DSL used internally by {Lexer}
    #
    # @api private
    class RuleDSL
      attr_reader :lex_tokens,
                  :state_info,
                  :state_re,
                  :state_names,
                  :state_ignore,
                  :state_error,
                  :state_lexemes

      # @api private
      def initialize
        @state_info = { initial: :inclusive }
        @state_ignore = { initial: '' } # Ignored characters for each state
        @state_error = {} # Error conditions for each state
        @state_re = Hash.new { |hash, name| hash[name] = {}} # Regexes for each state
        @state_names = {} # Symbol names for each state
        @state_lexemes = Hash.new { |hash, name| hash[name] = State.new(name) }
        @lex_tokens = [] # List of valid tokens
      end

      # Add tokens to lexer
      #
      # @api public
      def tokens(*value)
        @lex_tokens = value
      end

      # Add states to lexer
      #
      # @api public
      def states(value)
        @state_info.merge!(value)
      end

      # Specify lexing rule
      #
      # @param [Symbol] name
      #   the rule name, optionally prefixed with state names
      #
      # @param [Regexp] pattern
      #   the regex pattern
      #
      # @api public
      def rule(name, pattern, &action)
        state_names, token_name = *extract_state_token(name)
        # NOTE: token_name is a Symbol; `Symbol =~ Regexp` always returns
        # nil (Object#=~), which silently disabled this validation — the
        # name must be converted to a string before matching.
        if token_name.to_s =~ /\A[[:upper:]]+\z/ && !@lex_tokens.include?(token_name)
          complain("Rule '#{name}' defined for" \
                   " an unspecified token #{token_name}")
        end
        state_names.each do |state_name|
          state = @state_lexemes[state_name]
          state << Lexeme.new(token_name, pattern, &action)
        end
        update_inclusive_states
        state_names.each do |state_name|
          if @state_re[state_name].key?(token_name)
            complain("Rule '#{name}' redefined.")
          end
          @state_re[state_name][token_name] = pattern
        end
      end

      # Define ignore condition for a state
      #
      # @param [Symbol] states
      #   the optional state names
      #
      # @param [String] value
      #   the characters to ignore
      #
      # @api public
      def ignore(states, value = (not_set = true))
        if not_set
          value = states
          state_names = [:initial]
        else
          state_names = states.to_s.split('_').map(&:to_sym)
        end
        if !value.is_a?(String)
          # `logger` is not defined on RuleDSL, so the original
          # logger.error call crashed with NoMethodError here;
          # raise a meaningful LexerError instead.
          complain("Ignore rule '#{value}' has to be defined with a string")
        end
        state_names.each do |state_name|
          @state_ignore[state_name] = value
        end
        # Inclusive states inherit the initial state's ignore set
        @state_info.each do |state_name, state_type|
          if state_name != :initial && state_type == :inclusive
            if !@state_ignore.key?(state_name)
              @state_ignore[state_name] = @state_ignore[:initial]
            end
          end
        end
      end

      # Define error condition for a state
      #
      # @api public
      def error(states = :initial, &action)
        state_names = states.to_s.split('_').map(&:to_sym)
        state_names.each do |state_name|
          @state_error[state_name] = action
        end
        # Inclusive states inherit the initial state's error handler
        @state_info.each do |state_name, state_type|
          if state_name != :initial && state_type == :inclusive
            if !@state_error.key?(state_name)
              @state_error[state_name] = @state_error[:initial]
            end
          end
        end
      end

      private

      # For inclusive states copy over initial state rules
      #
      # @api private
      def update_inclusive_states
        @state_info.each do |state_name, state_type|
          if state_name != :initial && state_type == :inclusive
            initial_state = @state_lexemes[:initial]
            @state_lexemes[state_name].update(initial_state.lexemes)
          end
        end
      end

      # Extract tuple of state names and token name
      #
      # @param [Symbol] name
      #   the rule name
      #
      # @return [Array[Symbol], Symbol]
      #   returns tuples [states, token]
      #
      # @api private
      def extract_state_token(name)
        parts = name.to_s.split('_')
        state_i = 0
        # Find the first part that is not a known state name
        parts.each_with_index do |part, i|
          if !@state_info.keys.include?(part.to_sym)
            state_i = i
            break
          end
        end
        states = if state_i > 0
                   parts[0...state_i].map(&:to_sym)
                 else
                   [:initial]
                 end
        token_name = parts[state_i..-1].join('_').to_sym
        [states, token_name]
      end

      # Raise a LexerError with the given message
      #
      # @api private
      def complain(*args)
        raise LexerError, *args
      end
    end # RuleDSL
  end # Lexer
end # Lex