minilex 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +19 -0
- data/README.md +103 -0
- data/Rakefile +29 -0
- data/lib/minilex.rb +144 -0
- data/spec/lexer_spec.rb +78 -0
- metadata +63 -0
data/LICENSE
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
Copyright (c) 2012 Arun Srinivasan
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
4
|
+
this software and associated documentation files (the "Software"), to deal in
|
5
|
+
the Software without restriction, including without limitation the rights to
|
6
|
+
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
7
|
+
of the Software, and to permit persons to whom the Software is furnished to do
|
8
|
+
so, subject to the following conditions:
|
9
|
+
|
10
|
+
The above copyright notice and this permission notice shall be included in all
|
11
|
+
copies or substantial portions of the Software.
|
12
|
+
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
19
|
+
SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,103 @@
|
|
1
|
+
# Minilex
|
2
|
+
|
3
|
+
A little lexer toolkit, for basic lexing needs.
|
4
|
+
|
5
|
+
It's designed for the cases where parsers do the parsing, and all you need from
|
6
|
+
your lexer is an array of simple tokens.
|
7
|
+
|
8
|
+
## Usage
|
9
|
+
|
10
|
+
```ruby
|
11
|
+
Expression = Minilex::Lexer.new do
|
12
|
+
skip :whitespace, /\s+/
|
13
|
+
tok :number, /\d+(?:\.\d+)?/
|
14
|
+
tok :operator, /[\+\=\/\*]/
|
15
|
+
end
|
16
|
+
|
17
|
+
Expression.lex('1 + 2.34')
|
18
|
+
# => [[:number, '1', 1, 0],
|
19
|
+
# [:operator, '+', 1, 3],
|
20
|
+
# [:number, '2.34', 1, 5]
|
21
|
+
# [:eos]]
|
22
|
+
```
|
23
|
+
|
24
|
+
To create a lexer with Minilex, instantiate a `Minilex::Lexer` and define rules.
|
25
|
+
|
26
|
+
There are two methods for defining rules, `skip` and `tok`:
|
27
|
+
|
28
|
+
`skip` takes an `id` and a `pattern`. The lexer will ignore all occurrences of
|
29
|
+
the pattern in the input text. The `id` isn't strictly necessary, but it's nice
|
30
|
+
for readability and is a required argument.
|
31
|
+
|
32
|
+
`tok` also takes an `id` and a `pattern`. The lexer will turn all occurrences
|
33
|
+
of the pattern into a token of the form:
|
34
|
+
|
35
|
+
```ruby
|
36
|
+
[id, value, line, offset]
|
37
|
+
|
38
|
+
# id - the id you provided
|
39
|
+
# value - the matched value
|
40
|
+
# line - line number
|
41
|
+
# offset - character position in the line
|
42
|
+
```
|
43
|
+
|
44
|
+
## Overriding the token format
|
45
|
+
|
46
|
+
If you'd like to customize the token format, override `append_token`:
|
47
|
+
|
48
|
+
```ruby
|
49
|
+
Digits = Minilex::Lexer.new do
|
50
|
+
skip :whitespace, /\s+/
|
51
|
+
tok :digit, /\d/
|
52
|
+
|
53
|
+
# id - the id of the matched rule
|
54
|
+
# value - the value that was matched
|
55
|
+
#
|
56
|
+
# You have access to the array of tokens via `tokens` and the current
|
57
|
+
# token's position information via `pos`.
|
58
|
+
def append_token(id, value)
|
59
|
+
tokens << Integer(value)
|
60
|
+
end
|
61
|
+
|
62
|
+
# By default, the lexer will append an end-of-stream token to the end of
|
63
|
+
# the tokens array. You can override what the eos token is or even suppress
|
64
|
+
# it altogether with the append_eos callback.
|
65
|
+
#
|
66
|
+
# Here we'll suppress it by doing nothing
|
67
|
+
def append_eos
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
Digits.lex('1 2 3 4')
|
72
|
+
# => [1, 2, 3, 4]
|
73
|
+
```
|
74
|
+
|
75
|
+
## Processing values
|
76
|
+
|
77
|
+
There's one more thing you can do. It's just for convenience, though I'm not
|
78
|
+
sure it really belongs in something that's supposed to do as little as
|
79
|
+
possible. I might remove it.
|
80
|
+
|
81
|
+
The `tok` method accepts a third optional `processor` argument, which should
|
82
|
+
name a method on the lexer (you'll have to write the method, of course).
|
83
|
+
|
84
|
+
What this will do is give you a chance to get at the matched text before it
|
85
|
+
gets stuffed into a token:
|
86
|
+
|
87
|
+
```ruby
|
88
|
+
DigitsConverter = Minilex::Lexer.new do
|
89
|
+
skip :whitespace, /\s+/
|
90
|
+
tok :digit, /\d/, :integer
|
91
|
+
|
92
|
+
def integer(str)
|
93
|
+
Integer(str)
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
97
|
+
DigitsConverter.lex('123')
|
98
|
+
# => [[:digit, 1, 1, 0], [:digit, 2, 1, 1], [:digit, 3, 1, 2], [:eos]]
|
99
|
+
# ^ ^ ^
|
100
|
+
# ^ ^ ^
|
101
|
+
# These are Integers (would have been Strings)
|
102
|
+
```
|
103
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# Rake tasks for building and releasing the minilex gem.
require File.expand_path('../lib/minilex', __FILE__)

version = Minilex::VERSION
name    = 'minilex'

desc "Build minilex gem"
task build: :clean do
  # Ensure the package directory exists, then build from the gemspec.
  sh "mkdir -p pkg"
  sh "gem build minilex.gemspec"
end

desc "Create tag v#{version}, build, and push to Rubygems"
task release: :build do
  # Releases are only cut from master; bail out otherwise.
  unless `git branch` =~ /^\* master$/
    puts "You must be on the master branch to release!"
    exit!
  end

  # Record the release, tag it, push both, then publish the gem.
  sh "git commit --allow-empty -a -m 'Release #{version}'"
  sh "git tag v#{version}"
  sh "git push origin master"
  sh "git push origin v#{version}"
  sh "gem push #{name}-#{version}.gem"
end

desc "Clean up generated files"
task :clean do
  sh "rm -f *.gem"
end
|
29
|
+
|
data/lib/minilex.rb
ADDED
@@ -0,0 +1,144 @@
|
|
1
|
+
require 'strscan'

module Minilex
  # A single lexing rule. `skip` is truthy for rules whose matches
  # are discarded instead of turned into tokens.
  Rule = Struct.new(:id, :pattern, :processor, :skip)

  # Position in the input: 1-based line number, 0-based character offset.
  Pos = Struct.new(:line, :offset)

  class Lexer
    attr_reader :rules, :tokens, :pos, :scanner

    # Creates a Lexer instance
    #
    #   Expression = Minilex::Lexer.new do
    #     skip :whitespace, /\s+/
    #     tok :number, /\d+(?:\.\d+)?/
    #     tok :operator, /[\+\=\/\*]/
    #   end
    #
    # You don't have to pass a block. This also works:
    #
    #   Expression = Minilex::Lexer.new
    #   Expression.skip :whitespace, /\s+/
    #   Expression.tok :number, /\d+(?:\.\d+)?/
    #   Expression.tok :operator, /[\+\=\/\*]/
    def initialize(&block)
      @rules = []
      # Parenthesized to avoid Ruby's "`&' interpreted as argument prefix"
      # ambiguity warning with a bare `instance_eval &block`.
      instance_eval(&block) if block
    end

    # Defines a token-matching rule
    #
    # id        - this token's identifier
    # pattern   - a Regexp to match this token
    # processor - a Symbol that references a method on
    #             this Lexer instance, which will
    #             be called to produce the `value`
    #             for this token (defaults to nil)
    def tok(id, pattern, processor = nil)
      rules << Rule.new(id, pattern, processor)
    end

    # Defines patterns to ignore
    #
    # id      - an identifier, it's nice to name things
    # pattern - the Regexp to skip
    def skip(id, pattern)
      rules << Rule.new(id, pattern, nil, true)
    end

    # Runs the lexer on the given input
    #
    # input - the String to tokenize
    #
    # returns an Array of tokens
    def lex(input)
      @tokens  = []
      @pos     = Pos.new(1, 0)
      @scanner = StringScanner.new(input)

      until scanner.eos?
        rule, text = match
        value = rule.processor ? send(rule.processor, text) : text
        append_token(rule.id, value) unless rule.skip
        # Position is updated after appending, so the token carries the
        # position where the match started.
        update_pos(text)
      end

      append_eos
      tokens
    end

    # Makes a token
    #
    # id    - the id of the matched rule
    # value - the value that was matched
    #
    # Called when a rule is matched to build the
    # resulting token.
    #
    # Override this method if you'd like your tokens
    # in a different form. You have access to the
    # array of tokens via `tokens` and the current
    # token's position information via `pos`.
    #
    # returns an Array of [id, value, line, offset]
    def append_token(id, value)
      tokens << [id, value, pos.line, pos.offset]
    end

    # Makes the end-of-stream token
    #
    # Similar to `append_token`, used to make the final
    # token. Appends [:eos] to the `tokens` array; override
    # to change or suppress the eos token.
    def append_eos
      tokens << [:eos]
    end

    # [internal] Finds the matching rule
    #
    # Tries the rules in defined order until there's
    # a match. Raises an UnrecognizedInput error if
    # there isn't one.
    #
    # returns a 2-element Array of [rule, matched_text]
    def match
      rules.each do |rule|
        next unless text = scanner.scan(rule.pattern)
        return [rule, text]
      end
      raise UnrecognizedInput.new(scanner, pos)
    end

    # [internal] Updates the position information
    #
    # text - the String that was matched by `match`
    #
    # Inspects the matched text for newlines and updates
    # the line number and offset accordingly.
    def update_pos(text)
      pos.line += newlines = text.count("\n")
      if newlines > 0
        # Offset restarts after the last newline in the matched text.
        pos.offset = text.rpartition("\n")[2].length
      else
        pos.offset += text.length
      end
    end
  end

  # The error raised when a Lexer can't match some input
  #
  # It will show the offending characters and tell you
  # where in the input it was when it got confused.
  class UnrecognizedInput < StandardError
    attr_reader :scanner, :pos

    def initialize(scanner, pos)
      @scanner = scanner
      @pos = pos
    end

    def to_s
      "\"#{scanner.peek(10)}\" at line:#{pos.line}, offset:#{pos.offset}"
    end
  end
end

Minilex::VERSION = '0.1.0'
|
144
|
+
|
data/spec/lexer_spec.rb
ADDED
@@ -0,0 +1,78 @@
|
|
1
|
+
require 'rspec'
require 'minilex'

# Specs for Minilex::Lexer. Uses the `expect` syntax throughout for
# consistency (the file previously mixed deprecated `should` with
# `expect { }` blocks).
describe "A simple lexer" do
  let(:lexer) do
    Minilex::Lexer.new do
      skip :whitespace, /\s+/
      tok :digit, /\d/
    end
  end

  it "raises an error on unrecognized input" do
    expect do
      lexer.lex('123abc')
    end.to raise_error(Minilex::UnrecognizedInput)
  end

  it "returns a single :eos token on empty input" do
    expect(lexer.lex('')).to eq([[:eos]])
  end

  it "recognizes a single digit" do
    expect(lexer.lex('1')).to eq([[:digit, '1', 1, 0], [:eos]])
  end

  it "recognizes multiple digits" do
    expect(lexer.lex('123')).to eq(
      [[:digit, '1', 1, 0], [:digit, '2', 1, 1], [:digit, '3', 1, 2], [:eos]]
    )
  end

  it "skips whitespace" do
    expect(lexer.lex('1 2 3')).to eq(
      [[:digit, '1', 1, 0], [:digit, '2', 1, 2], [:digit, '3', 1, 4], [:eos]]
    )
  end

  it "keeps track of line numbers" do
    expect(lexer.lex("1\n2\n3")).to eq(
      [[:digit, '1', 1, 0], [:digit, '2', 2, 0], [:digit, '3', 3, 0], [:eos]]
    )
  end

  it "keeps track of line offsets" do
    expect(lexer.lex(" 1\n  2\n   3")).to eq(
      [[:digit, '1', 1, 1], [:digit, '2', 2, 2], [:digit, '3', 3, 3], [:eos]]
    )
  end
end

describe "A simple lexer with converter" do
  let(:lexer) do
    Minilex::Lexer.new do
      skip :whitespace, /\s+/
      tok :digit, /\d/, :integer

      def integer(str); Integer(str); end
    end
  end

  it "converts the digit value to an integer" do
    expect(lexer.lex('123')).to eq(
      [[:digit, 1, 1, 0], [:digit, 2, 1, 1], [:digit, 3, 1, 2], [:eos]]
    )
  end
end

describe "Overriding how tokens are made" do
  let(:lexer) do
    Minilex::Lexer.new do
      skip :whitespace, /\s+/
      tok :digit, /\d/

      def append_token(id, value); tokens << Integer(value) ** 2; end
      def append_eos; tokens << "Zanzibar!"; end
    end
  end

  it "returns tokens from the overwritten :append_{token|eos} methods" do
    expect(lexer.lex('123')).to eq([1, 4, 9, "Zanzibar!"])
  end
end
|
78
|
+
|
metadata
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: minilex
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- satchmorun
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-04-30 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: rspec
|
16
|
+
requirement: &70325614561220 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :development
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *70325614561220
|
25
|
+
description: A little lexer toolkit, designed for the cases where parsers do the parsing
|
26
|
+
and lexers do the lexing.
|
27
|
+
email: rulfzid@gmail.com
|
28
|
+
executables: []
|
29
|
+
extensions: []
|
30
|
+
extra_rdoc_files: []
|
31
|
+
files:
|
32
|
+
- LICENSE
|
33
|
+
- README.md
|
34
|
+
- Rakefile
|
35
|
+
- lib/minilex.rb
|
36
|
+
- spec/lexer_spec.rb
|
37
|
+
homepage: http://github.com/satchmorun/minilex
|
38
|
+
licenses:
|
39
|
+
- MIT
|
40
|
+
post_install_message:
|
41
|
+
rdoc_options: []
|
42
|
+
require_paths:
|
43
|
+
- lib
|
44
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
45
|
+
none: false
|
46
|
+
requirements:
|
47
|
+
- - ! '>='
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: '0'
|
50
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
51
|
+
none: false
|
52
|
+
requirements:
|
53
|
+
- - ! '>='
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
version: '0'
|
56
|
+
requirements: []
|
57
|
+
rubyforge_project:
|
58
|
+
rubygems_version: 1.8.10
|
59
|
+
signing_key:
|
60
|
+
specification_version: 3
|
61
|
+
summary: A little lexer toolkit.
|
62
|
+
test_files:
|
63
|
+
- spec/lexer_spec.rb
|