rlex 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +17 -0
- data/.rspec +2 -0
- data/Gemfile +4 -0
- data/LICENSE +22 -0
- data/README.md +74 -0
- data/Rakefile +2 -0
- data/lib/rlex.rb +8 -0
- data/lib/rlex/lexer.rb +172 -0
- data/lib/rlex/token.rb +16 -0
- data/lib/rlex/version.rb +4 -0
- data/rlex.gemspec +20 -0
- data/spec/rlex/lexer_spec.rb +69 -0
- data/spec/spec_helper.rb +11 -0
- metadata +69 -0
data/.gitignore
ADDED
data/.rspec
ADDED
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
+Copyright (c) 2012 Rasmus Borgsmidt
+
+MIT License
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,74 @@
+# Rlex
+
+Provides a simple lexer based on the <tt>StringScanner</tt> class.
+
+The lexer was written for use with Racc, a Ruby variant of
+Yacc. But there is no code dependency on that project so the
+lexer may also be used on its own or with other packages.
+
+* Ignored input takes precedence over rules and keywords, so if
+  a prefix is matched by an ignore pattern, it's ignored even if
+  it's also a keyword or matched by a rule
+* The lexer is greedy, so if a prefix is matched by multiple
+  rules or keywords, the lexer chooses the option consuming the
+  most input
+
+## Documentation
+
+Find out more about this project on [GitHub][gh_rlex],
+[RubyGems][rg_rlex] and [RubyDoc][rd_rlex].
+
+[gh_rlex]: https://github.com/borgsmidt/rlex
+[rg_rlex]: http://rubygems.org/gems/rlex
+[rd_rlex]: http://rubydoc.info/gems/rlex
+
+## Installation
+
+Install the gem using:
+
+    $ gem install rlex
+
+Alternatively, add this line to your application's Gemfile:
+
+    gem 'rlex'
+
+Then execute:
+
+    $ bundle
+
+## Usage
+
+```ruby
+# Define behavior
+lexer = Lexer.new
+lexer.ignore /\s+/ # ignore whitespace
+lexer.rule :word, /\w+/ # consider any text a 'word'
+lexer.keyword :if # treat 'if' as a special keyword
+lexer.keyword :lparen, "(" # any fixed input such as parentheses
+lexer.keyword :rparen, ")" # may be defined as keywords
+
+# Initialize with input
+lexer.start "if ( foo ) bar" # initialize the lexer with a string
+
+# Iterate through tokens
+lexer.next_token # => Token (type = :if, value = 'if' )
+lexer.next_token # => Token (type = :lparen, value = '(' )
+lexer.next_token # => Token (type = :word, value = 'foo')
+lexer.next_token # => Token (type = :rparen, value = ')' )
+lexer.next_token # => Token (type = :word, value = 'bar')
+lexer.next_token # => EOF_TOKEN
+```
+
+## Contributing
+
+1. Fork it on [GitHub][gh_rlex]
+2. Create your feature branch (`git checkout -b my-new-feature`)
+3. Commit your changes (`git commit -am 'Added some feature'`)
+4. Push to the branch (`git push origin my-new-feature`)
+5. Create new Pull Request
+
+## License
+
+Written by Rasmus Borgsmidt <<rasmus@borgsmidt.dk>>
+
+Released under the MIT license: www.opensource.org/licenses/MIT
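The two behavioral bullets at the top of the README (ignore precedence and greedy matching) are worth seeing in action. The snippet below is a minimal sketch, not part of the package, built only from the API the README documents; it mirrors the "ifu ( if ) ifu" case exercised in the spec file further down this diff.

```ruby
require 'rlex/lexer'
require 'rlex/token'
include Rlex

lexer = Lexer.new
lexer.ignore /\s+/       # ignored input beats every rule and keyword
lexer.keyword :if        # static keyword 'if'
lexer.rule :word, /\w+/  # generic rule that also matches 'if'

lexer.start "ifu if"
lexer.next_token  # => Token(:word, "ifu") -- greedy: /\w+/ consumes 3 chars, the keyword only 2
lexer.next_token  # => Token(:if, "if")    -- matched text equals the keyword, so the keyword token wins
lexer.next_token  # => EOF_TOKEN           -- no input left
```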
data/Rakefile
ADDED
data/lib/rlex.rb
ADDED
data/lib/rlex/lexer.rb
ADDED
@@ -0,0 +1,172 @@
+require 'strscan'
+require 'rlex/token'
+
+module Rlex
+  # Implements a simple lexer using a <tt>StringScanner</tt>.
+  #
+  # The lexer was written for use with Racc, a Ruby variant of
+  # Yacc. But there is no code dependency on that project so the
+  # lexer may also be used on its own or with other packages.
+  #
+  # * Ignored input takes precedence over rules and keywords, so if
+  #   a prefix is matched by an ignore pattern, it's ignored even if
+  #   it's also a keyword or matched by a rule
+  # * The lexer is greedy, so if a prefix is matched by multiple
+  #   rules or keywords, the lexer chooses the option consuming the
+  #   most input
+  #
+  # @author Rasmus Borgsmidt <rasmus@borgsmidt.dk>
+  #
+  # @example Basic usage
+  #   # Define behavior
+  #   lexer = Lexer.new
+  #   lexer.ignore /\s+/ # ignore whitespace
+  #   lexer.rule :word, /\w+/ # consider any text a 'word'
+  #   lexer.keyword :if # treat 'if' as a special keyword
+  #   lexer.keyword :lparen, "(" # any fixed input such as parentheses
+  #   lexer.keyword :rparen, ")" # may be defined as keywords
+  #
+  #   # Initialize with input
+  #   lexer.start "if ( foo ) bar" # initialize the lexer with a string
+  #
+  #   # Iterate through tokens
+  #   lexer.next_token # => Token (type = :if, value = 'if')
+  #   lexer.next_token # => Token (type = :lparen, value = '(')
+  #   lexer.next_token # => Token (type = :word, value = 'foo')
+  #   lexer.next_token # => Token (type = :rparen, value = ')')
+  #   lexer.next_token # => Token (type = :word, value = 'bar')
+  #   lexer.next_token # => EOF_TOKEN
+  #
+  class Lexer
+    # Initializes an empty Lexer.
+    #
+    def initialize
+      @ignored = []
+      @rules = []
+      @keywords = {}
+    end
+
+    # Instructs the lexer to ignore input matched by the specified
+    # pattern. If appropriate, call this multiple times to ignore
+    # several patterns.
+    #
+    # @note Ignored input takes precedence over rules and keywords,
+    #   so if a prefix is matched by an ignore pattern, it's ignored
+    #   even if it's also a keyword or matched by a rule
+    #
+    # @param [Regexp] pattern Pattern of input to ignore
+    #
+    # @return [Regexp] The specified pattern
+    #
+    def ignore(pattern)
+      @ignored << pattern
+      return pattern
+    end
+
+    # Defines a rule to match the specified pattern.
+    #
+    # @note Use keywords for efficiency instead of rules whenever
+    #   the matched input is static
+    #
+    # @param [Symbol, #to_sym] name Unique name of rule
+    # @param [Regexp] pattern Pattern of input to match
+    #
+    # @raise [ArgumentError] If the specified name is already
+    #   used by other rules or keywords
+    #
+    # @return [Symbol] The name of the rule
+    #
+    def rule(name, pattern)
+      # @todo Validate the rule name
+      @rules << (Rule.new name.to_sym, pattern)
+      return name.to_sym
+    end
+
+    # Defines a static sequence of input as a keyword.
+    #
+    # @note Use keywords for efficiency instead of rules whenever
+    #   the matched input is static
+    #
+    # @param [optional, Symbol, #to_sym] name Unique name of the
+    #   keyword. If this argument is not given, the keyword is used
+    #   to name itself
+    # @param [String, #to_s] kword Sequence of input to match as a
+    #   keyword
+    #
+    # @raise [ArgumentError] If the specified name is already
+    #   used by other rules or keywords
+    #
+    # @return [Symbol] The name of the keyword
+    #
+    def keyword(name = nil, kword)
+      # @todo Validate the keyword name
+      name = kword if name == nil
+      pattern = Regexp.new(Regexp.escape kword.to_s)
+      rule name, pattern
+      @keywords[kword.to_s] = Token.new name.to_sym, kword.to_s
+      return name.to_sym
+    end
+
+    # Initializes the lexer with new input.
+    #
+    # @note This resets the lexer with a new StringScanner so any
+    #   state information related to previous input is lost
+    #
+    # @param [String] input Input to scan for tokens
+    #
+    # @return [String] The specified input
+    #
+    def start(input)
+      @scanner = StringScanner.new input
+      return input
+    end
+
+    # Returns the next token matched from the remaining input. If no
+    # input is left, or the lexer has not been initialized,
+    # <tt>EOF_TOKEN</tt> is returned.
+    #
+    # @raise [RuntimeError] If there is any unmatched input
+    #
+    # @return [Token] Next token or <tt>EOF_TOKEN</tt>
+    #
+    def next_token
+      return EOF_TOKEN if @scanner.nil? or @scanner.empty?
+      return next_token if ignore_prefix?
+      rule = greediest_rule
+      if rule
+        prefix = @scanner.scan(rule.pattern)
+        keyword = @keywords[prefix]
+        return keyword ? keyword : Token.new(rule.name, prefix)
+      end
+      raise "unexpected input <#{@scanner.peek(5)}>"
+    end
+
+    private
+
+    # @private
+    Rule = Struct.new :name, :pattern
+
+    # @private
+    def ignore_prefix?
+      @ignored.each do |pattern|
+        prefix = @scanner.scan(pattern)
+        return true if prefix
+      end
+      return false
+    end
+
+    # @private
+    def greediest_rule
+      r = nil
+      len = 0
+      @rules.each do |rule|
+        prefix = @scanner.check(rule.pattern)
+        if prefix and prefix.length > len
+          r = rule
+          len = prefix.length
+        end
+      end
+      return r
+    end
+  end
+end
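The class comment above notes that the lexer was written for use with Racc without depending on it. As an illustration only, the glue between the two could look roughly like the sketch below; the parser class name and grammar symbols are hypothetical, and the `[type, value]` / `[false, false]` protocol is Racc's usual `do_parse` convention, not anything defined by rlex.

```ruby
require 'rlex/lexer'
require 'rlex/token'

# Hypothetical: MyParser is assumed to be generated by racc from a grammar
# whose terminals are :if, :lparen, :rparen and :word.
class MyParser
  def parse(input)
    @lexer = Rlex::Lexer.new
    @lexer.ignore /\s+/
    @lexer.keyword :if
    @lexer.keyword :lparen, "("
    @lexer.keyword :rparen, ")"
    @lexer.rule :word, /\w+/
    @lexer.start input
    do_parse                      # Racc's entry point; it calls next_token repeatedly
  end

  # Racc expects [type, value] pairs and [false, false] at end of input.
  def next_token
    token = @lexer.next_token
    return [false, false] if token == Rlex::EOF_TOKEN
    [token.type, token.value]
  end
end
```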
data/lib/rlex/token.rb
ADDED
@@ -0,0 +1,16 @@
+module Rlex
+  # Simple class to represent tokens matched from the input.
+  #
+  # @author Rasmus Borgsmidt <rasmus@borgsmidt.dk>
+  #
+  # @attr_reader [Symbol] type Type of the token, such as the name of
+  #   the rule used to match it
+  # @attr_reader [String] value Text matched from the input
+  #
+  Token = Struct.new :type, :value
+
+  # Special token used when the lexer has reached the end of the
+  # specified input.
+  #
+  EOF_TOKEN = Token.new :eof, ""
+end
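Because `Token` is a plain `Struct`, two tokens with the same type and value compare equal; that is what lets `next_token` hand back the pre-built keyword tokens and lets the specs below compare results with `eq`. A quick sketch of that behavior (not part of the package):

```ruby
require 'rlex/token'
include Rlex

Token.new(:word, "foo") == Token.new(:word, "foo")  # => true, Struct compares member values
EOF_TOKEN.type                                      # => :eof
EOF_TOKEN.value                                     # => ""
```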
data/lib/rlex/version.rb
ADDED
data/rlex.gemspec
ADDED
@@ -0,0 +1,20 @@
+# -*- encoding: utf-8 -*-
+require File.expand_path('../lib/rlex/version', __FILE__)
+
+Gem::Specification.new do |gem|
+  gem.authors       = ["Rasmus Borgsmidt"]
+  gem.email         = ["rasmus@borgsmidt.dk"]
+  gem.description   = %q{Implements a simple lexer using a StringScanner}
+  gem.summary       = %q{The lexer was written for use with Racc, a
+                         Ruby variant of Yacc. But there is no code
+                         dependency on that project so the lexer may
+                         also be used on its own or with other packages.}
+  gem.homepage      = "https://github.com/borgsmidt/rlex"
+
+  gem.files         = `git ls-files`.split($\)
+  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
+  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
+  gem.name          = "rlex"
+  gem.require_paths = ["lib"]
+  gem.version       = Rlex::VERSION
+end
data/spec/rlex/lexer_spec.rb
ADDED
@@ -0,0 +1,69 @@
+require 'spec_helper'
+require 'rlex/token'
+require 'rlex/lexer'
+include Rlex
+
+describe Lexer do
+  before :each do
+    @lexer = Lexer.new
+  end
+
+  describe "next_token" do
+    it "should return EOF_TOKEN when the lexer has been assigned no input" do
+      @lexer.next_token.should eq EOF_TOKEN
+    end
+
+    it "should return EOF_TOKEN when there is no more input" do
+      @lexer.start ""
+      @lexer.next_token.should eq EOF_TOKEN
+    end
+
+    it "should raise an error when remaining input is not matched by any rules" do
+      @lexer.start "unmatched input"
+      lambda {@lexer.next_token}.should raise_error RuntimeError
+    end
+
+    it "should return EOF_TOKEN when instructed to ignore everything" do
+      @lexer.ignore /.*/
+      @lexer.start "input to be ignored"
+      @lexer.next_token.should eq EOF_TOKEN
+    end
+
+    it "should return recognized keywords and finish with EOF_TOKEN" do
+      @lexer.ignore /\s+/
+      @lexer.keyword :special
+      @lexer.start " \t\nspecialspecial special "
+      special = Token.new :special, "special"
+      @lexer.next_token.should eq special
+      @lexer.next_token.should eq special
+      @lexer.next_token.should eq special
+      @lexer.next_token.should eq EOF_TOKEN
+    end
+
+    it "should return tokens matched by regular rules and finish with EOF_TOKEN" do
+      @lexer.ignore /\s+/
+      @lexer.rule :word, /\w+/
+      @lexer.start "sentence with four tokens"
+      @lexer.next_token.should eq Token.new :word, "sentence"
+      @lexer.next_token.should eq Token.new :word, "with"
+      @lexer.next_token.should eq Token.new :word, "four"
+      @lexer.next_token.should eq Token.new :word, "tokens"
+      @lexer.next_token.should eq EOF_TOKEN
+    end
+
+    it "should return all types of tokens and finish with EOF_TOKEN" do
+      @lexer.ignore /\s+/
+      @lexer.keyword :if
+      @lexer.keyword :lparen, "("
+      @lexer.keyword :rparen, ")"
+      @lexer.rule :word, /\w+/
+      @lexer.start "ifu ( if ) ifu"
+      @lexer.next_token.should eq Token.new :word, "ifu"
+      @lexer.next_token.should eq Token.new :lparen, "("
+      @lexer.next_token.should eq Token.new :if, "if"
+      @lexer.next_token.should eq Token.new :rparen, ")"
+      @lexer.next_token.should eq Token.new :word, "ifu"
+      @lexer.next_token.should eq EOF_TOKEN
+    end
+  end
+end
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,11 @@
+# This file was generated by the `rspec --init` command. Conventionally, all
+# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
+# Require this file using `require "spec_helper.rb"` to ensure that it is only
+# loaded once.
+#
+# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
+RSpec.configure do |config|
+  config.treat_symbols_as_metadata_keys_with_true_values = true
+  config.run_all_when_everything_filtered = true
+  config.filter_run :focus
+end
metadata
ADDED
@@ -0,0 +1,69 @@
+--- !ruby/object:Gem::Specification
+name: rlex
+version: !ruby/object:Gem::Version
+  prerelease:
+  version: 0.5.0
+platform: ruby
+authors:
+- Rasmus Borgsmidt
+autorequire:
+bindir: bin
+cert_chain: []
+
+date: 2012-04-24 00:00:00 Z
+dependencies: []
+
+description: Implements a simple lexer using a StringScanner
+email:
+- rasmus@borgsmidt.dk
+executables: []
+
+extensions: []
+
+extra_rdoc_files: []
+
+files:
+- .gitignore
+- .rspec
+- Gemfile
+- LICENSE
+- README.md
+- Rakefile
+- lib/rlex.rb
+- lib/rlex/lexer.rb
+- lib/rlex/token.rb
+- lib/rlex/version.rb
+- rlex.gemspec
+- spec/rlex/lexer_spec.rb
+- spec/spec_helper.rb
+homepage: https://github.com/borgsmidt/rlex
+licenses: []
+
+post_install_message:
+rdoc_options: []
+
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+requirements: []
+
+rubyforge_project:
+rubygems_version: 1.8.11
+signing_key:
+specification_version: 3
+summary: The lexer was written for use with Racc, a Ruby variant of Yacc. But there is no code dependency on that project so the lexer may also be used on its own or with other packages.
+test_files:
+- spec/rlex/lexer_spec.rb
+- spec/spec_helper.rb
+has_rdoc: