minilex 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +19 -0
- data/README.md +103 -0
- data/Rakefile +29 -0
- data/lib/minilex.rb +144 -0
- data/spec/lexer_spec.rb +78 -0
- metadata +63 -0
data/LICENSE
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
Copyright (c) 2012 Arun Srinivasan
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
|
4
|
+
this software and associated documentation files (the "Software"), to deal in
|
|
5
|
+
the Software without restriction, including without limitation the rights to
|
|
6
|
+
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
|
7
|
+
of the Software, and to permit persons to whom the Software is furnished to do
|
|
8
|
+
so, subject to the following conditions:
|
|
9
|
+
|
|
10
|
+
The above copyright notice and this permission notice shall be included in all
|
|
11
|
+
copies or substantial portions of the Software.
|
|
12
|
+
|
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
15
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
16
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
17
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
18
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
19
|
+
SOFTWARE.
|
data/README.md
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
# Minilex
|
|
2
|
+
|
|
3
|
+
A little lexer toolkit, for basic lexing needs.
|
|
4
|
+
|
|
5
|
+
It's designed for the cases where parsers do the parsing, and all you need from
|
|
6
|
+
your lexer is an array of simple tokens.
|
|
7
|
+
|
|
8
|
+
## Usage
|
|
9
|
+
|
|
10
|
+
```ruby
|
|
11
|
+
Expression = Minilex::Lexer.new do
|
|
12
|
+
skip :whitespace, /\s+/
|
|
13
|
+
tok :number, /\d+(?:\.\d+)?/
|
|
14
|
+
tok :operator, /[\+\=\/\*]/
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
Expression.lex('1 + 2.34')
|
|
18
|
+
# => [[:number, '1', 1, 0],
|
|
19
|
+
# [:operator, '+', 1, 3],
|
|
20
|
+
# [:number, '2.34', 1, 5],
|
|
21
|
+
# [:eos]]
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
To create a lexer with Minilex, instantiate a `Minilex::Lexer` and define rules.
|
|
25
|
+
|
|
26
|
+
There are two methods for defining rules, `skip` and `tok`:
|
|
27
|
+
|
|
28
|
+
`skip` takes an `id` and a `pattern`. The lexer will ignore all occurrences of
|
|
29
|
+
the pattern in the input text. The `id` isn't strictly necessary, but it's nice
|
|
30
|
+
for readability and is a required argument.
|
|
31
|
+
|
|
32
|
+
`tok` also takes an `id` and a `pattern`. The lexer will turn all occurrences
|
|
33
|
+
of the pattern into a token of the form:
|
|
34
|
+
|
|
35
|
+
```ruby
|
|
36
|
+
[id, value, line, offset]
|
|
37
|
+
|
|
38
|
+
# id - the id you provided
|
|
39
|
+
# value - the matched value
|
|
40
|
+
# line - line number
|
|
41
|
+
# offset - character position in the line
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Overriding the token format
|
|
45
|
+
|
|
46
|
+
If you'd like to customize the token format, override `append_token`:
|
|
47
|
+
|
|
48
|
+
```ruby
|
|
49
|
+
Digits = Minilex::Lexer.new do
|
|
50
|
+
skip :whitespace, /\s+/
|
|
51
|
+
tok :digit, /\d/
|
|
52
|
+
|
|
53
|
+
# id - the id of the matched rule
|
|
54
|
+
# value - the value that was matched
|
|
55
|
+
#
|
|
56
|
+
# You have access to the array of tokens via `tokens` and the current
|
|
57
|
+
# token's position information via `pos`.
|
|
58
|
+
def append_token(id, value)
|
|
59
|
+
tokens << Integer(value)
|
|
60
|
+
end
|
|
61
|
+
|
|
62
|
+
# By default, the lexer will append an end-of-stream token to the end of
|
|
63
|
+
# the tokens array. You can override what the eos token is or even suppress
|
|
64
|
+
# it altogether with the append_eos callback.
|
|
65
|
+
#
|
|
66
|
+
# Here we'll suppress it by doing nothing
|
|
67
|
+
def append_eos
|
|
68
|
+
end
|
|
69
|
+
end
|
|
70
|
+
|
|
71
|
+
Digits.lex('1 2 3 4')
|
|
72
|
+
# => [1, 2, 3, 4]
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
## Processing values
|
|
76
|
+
|
|
77
|
+
There's one more thing you can do. It's just for convenience, though I'm not
|
|
78
|
+
sure it really belongs in something that's supposed to do as little as
|
|
79
|
+
possible. I might remove it.
|
|
80
|
+
|
|
81
|
+
The `tok` method accepts a third optional `processor` argument, which should
|
|
82
|
+
name a method on the lexer (you'll have to write the method, of course).
|
|
83
|
+
|
|
84
|
+
What this will do is give you a chance to get at the matched text before it
|
|
85
|
+
gets stuffed into a token:
|
|
86
|
+
|
|
87
|
+
```ruby
|
|
88
|
+
DigitsConverter = Minilex::Lexer.new do
|
|
89
|
+
skip :whitespace, /\s+/
|
|
90
|
+
tok :digit, /\d/, :integer
|
|
91
|
+
|
|
92
|
+
def integer(str)
|
|
93
|
+
Integer(str)
|
|
94
|
+
end
|
|
95
|
+
end
|
|
96
|
+
|
|
97
|
+
DigitsConverter.lex('123')
|
|
98
|
+
# => [[:digit, 1, 1, 0], [:digit, 2, 1, 1], [:digit, 3, 1, 2], [:eos]]
|
|
99
|
+
# ^ ^ ^
|
|
100
|
+
# ^ ^ ^
|
|
101
|
+
# These are Integers (would have been Strings)
|
|
102
|
+
```
|
|
103
|
+
|
data/Rakefile
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
require File.expand_path('../lib/minilex', __FILE__)

version = Minilex::VERSION
name = 'minilex'

desc "Build minilex gem"
task :build => :clean do
  sh "mkdir -p pkg"
  sh "gem build minilex.gemspec"
end

desc "Create tag v#{version}, build, and push to Rubygems"
task :release => :build do
  # Refuse to release from any branch other than master — tagging the
  # wrong commit is hard to undo once pushed.
  #
  # `abort` prints the message to stderr and exits with status 1; the
  # previous `puts` + `exit!` wrote to stdout and skipped at_exit hooks.
  unless `git branch` =~ /^\* master$/
    abort "You must be on the master branch to release!"
  end
  sh "git commit --allow-empty -a -m 'Release #{version}'"
  sh "git tag v#{version}"
  sh "git push origin master"
  sh "git push origin v#{version}"
  sh "gem push #{name}-#{version}.gem"
end

desc "Clean up generated files"
task :clean do
  sh "rm -f *.gem"
end
|
|
29
|
+
|
data/lib/minilex.rb
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
require 'strscan'

module Minilex
  # A single lexing rule.
  #
  # id        - identifier attached to tokens produced by this rule
  # pattern   - the Regexp this rule matches
  # processor - optional Symbol naming a method on the Lexer that
  #             transforms the matched text into the token's value
  # skip      - truthy when matches should be discarded instead of
  #             turned into tokens
  Rule = Struct.new(:id, :pattern, :processor, :skip)

  # The lexer's position in the input: 1-based line number and
  # 0-based character offset within that line.
  Pos = Struct.new(:line, :offset)

  class Lexer
    attr_reader :rules, :tokens, :pos, :scanner

    # Creates a Lexer instance
    #
    #   Expression = Minilex::Lexer.new do
    #     skip :whitespace, /\s+/
    #     tok :number, /\d+(?:\.\d+)?/
    #     tok :operator, /[\+\=\/\*]/
    #   end
    #
    # You don't have to pass a block. This also works:
    #
    #   Expression = Minilex::Lexer.new
    #   Expression.skip :whitespace, /\s+/
    #   Expression.tok :number, /\d+(?:\.\d+)?/
    #   Expression.tok :operator, /[\+\=\/\*]/
    def initialize(&block)
      @rules = []
      # Parenthesized to avoid Ruby's "`&' interpreted as argument
      # prefix" ambiguity warning from `instance_eval &block`.
      instance_eval(&block) if block
    end

    # Defines a token-matching rule
    #
    # id        - this token's identifier
    # pattern   - a Regexp to match this token
    # processor - a Sym that references a method on this Lexer
    #             instance, which will be called to produce the
    #             `value` for this token (defaults to nil)
    def tok(id, pattern, processor = nil)
      rules << Rule.new(id, pattern, processor)
    end

    # Defines patterns to ignore
    #
    # id      - an identifier, it's nice to name things
    # pattern - the Regexp to skip
    def skip(id, pattern)
      rules << Rule.new(id, pattern, nil, true)
    end

    # Runs the lexer on the given input
    #
    # input - the String to tokenize
    #
    # returns an Array of tokens, ending with the end-of-stream
    # token appended by `append_eos` (by default `[:eos]`)
    def lex(input)
      @tokens = []
      @pos = Pos.new(1, 0)
      @scanner = StringScanner.new(input)

      until scanner.eos?
        rule, text = match
        value = rule.processor ? send(rule.processor, text) : text
        append_token(rule.id, value) unless rule.skip
        update_pos(text)
      end

      append_eos
      tokens
    end

    # Makes a token
    #
    # id    - the id of the matched rule
    # value - the value that was matched
    #
    # Called when a rule is matched to build the resulting token.
    #
    # Override this method if you'd like your tokens in a different
    # form. You have access to the array of tokens via `tokens` and
    # the current token's position information via `pos`.
    #
    # returns an Array of [id, value, line, offset]
    def append_token(id, value)
      tokens << [id, value, pos.line, pos.offset]
    end

    # Makes the end-of-stream token
    #
    # Similar to `append_token`, used to make the final token.
    # Appends [:eos] to the `tokens` array. Override with an empty
    # method to suppress the eos token entirely.
    def append_eos
      tokens << [:eos]
    end

    # [internal] Finds the matching rule
    #
    # Tries the rules in defined order until there's a match — so
    # earlier rules take priority. Raises an UnrecognizedInput error
    # if there isn't one.
    #
    # returns a 2-element Array of [rule, matched_text]
    def match
      rules.each do |rule|
        next unless text = scanner.scan(rule.pattern)
        return [rule, text]
      end
      raise UnrecognizedInput.new(scanner, pos)
    end

    # [internal] Updates the position information
    #
    # text - the String that was matched by `match`
    #
    # Inspects the matched text for newlines and updates the line
    # number and offset accordingly.
    def update_pos(text)
      newlines = text.count("\n")
      pos.line += newlines
      if newlines > 0
        # Offset restarts after the last newline in the match.
        pos.offset = text.rpartition("\n")[2].length
      else
        pos.offset += text.length
      end
    end
  end

  # The error raised when a Lexer can't match some input
  #
  # It will show the offending characters and tell you
  # where in the input it was when it got confused.
  class UnrecognizedInput < StandardError
    attr_reader :scanner, :pos

    def initialize(scanner, pos)
      @scanner = scanner
      @pos = pos
    end

    def to_s
      "\"#{scanner.peek(10)}\" at line:#{pos.line}, offset:#{pos.offset}"
    end
  end
end

Minilex::VERSION = '0.1.0'
|
|
144
|
+
|
data/spec/lexer_spec.rb
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
require 'rspec'
require 'minilex'

describe "A simple lexer" do
  # Digits separated by whitespace; whitespace is discarded.
  let(:lexer) do
    Minilex::Lexer.new do
      skip :whitespace, /\s+/
      tok :digit, /\d/
    end
  end

  it "raises an error on unrecognized input" do
    expect { lexer.lex('123abc') }.to raise_error(Minilex::UnrecognizedInput)
  end

  it "returns a single :eos token on empty input" do
    lexer.lex('').should == [[:eos]]
  end

  it "recognizes a single digit" do
    lexer.lex('1').should == [[:digit, '1', 1, 0], [:eos]]
  end

  it "recognizes multiple digits" do
    expected = [[:digit, '1', 1, 0], [:digit, '2', 1, 1], [:digit, '3', 1, 2], [:eos]]
    lexer.lex('123').should == expected
  end

  it "skips whitespace" do
    expected = [[:digit, '1', 1, 0], [:digit, '2', 1, 2], [:digit, '3', 1, 4], [:eos]]
    lexer.lex('1 2 3').should == expected
  end

  it "keeps track of line numbers" do
    expected = [[:digit, '1', 1, 0], [:digit, '2', 2, 0], [:digit, '3', 3, 0], [:eos]]
    lexer.lex("1\n2\n3").should == expected
  end

  it "keeps track of line offsets" do
    expected = [[:digit, '1', 1, 1], [:digit, '2', 2, 2], [:digit, '3', 3, 3], [:eos]]
    lexer.lex(" 1\n 2\n  3").should == expected
  end
end

describe "A simple lexer with converter" do
  # Same digit lexer, but values pass through the :integer processor.
  let(:lexer) do
    Minilex::Lexer.new do
      skip :whitespace, /\s+/
      tok :digit, /\d/, :integer

      def integer(str)
        Integer(str)
      end
    end
  end

  it "convert the digit value to an integer" do
    expected = [[:digit, 1, 1, 0], [:digit, 2, 1, 1], [:digit, 3, 1, 2], [:eos]]
    lexer.lex('123').should == expected
  end
end

describe "Overriding how tokens are made" do
  # Custom token construction: squared integers, sentinel eos value.
  let(:lexer) do
    Minilex::Lexer.new do
      skip :whitespace, /\s+/
      tok :digit, /\d/

      def append_token(id, value)
        tokens << Integer(value) ** 2
      end

      def append_eos
        tokens << "Zanzibar!"
      end
    end
  end

  it "returns tokens from the overwritten :append_{token|eos} methods" do
    lexer.lex('123').should == [1, 4, 9, "Zanzibar!"]
  end
end
|
|
78
|
+
|
metadata
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: minilex
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.1.0
|
|
5
|
+
prerelease:
|
|
6
|
+
platform: ruby
|
|
7
|
+
authors:
|
|
8
|
+
- satchmorun
|
|
9
|
+
autorequire:
|
|
10
|
+
bindir: bin
|
|
11
|
+
cert_chain: []
|
|
12
|
+
date: 2012-04-30 00:00:00.000000000 Z
|
|
13
|
+
dependencies:
|
|
14
|
+
- !ruby/object:Gem::Dependency
|
|
15
|
+
name: rspec
|
|
16
|
+
requirement: &70325614561220 !ruby/object:Gem::Requirement
|
|
17
|
+
none: false
|
|
18
|
+
requirements:
|
|
19
|
+
- - ! '>='
|
|
20
|
+
- !ruby/object:Gem::Version
|
|
21
|
+
version: '0'
|
|
22
|
+
type: :development
|
|
23
|
+
prerelease: false
|
|
24
|
+
version_requirements: *70325614561220
|
|
25
|
+
description: A little lexer toolkit, designed for the cases where parsers do the parsing
|
|
26
|
+
and lexers do the lexing.
|
|
27
|
+
email: rulfzid@gmail.com
|
|
28
|
+
executables: []
|
|
29
|
+
extensions: []
|
|
30
|
+
extra_rdoc_files: []
|
|
31
|
+
files:
|
|
32
|
+
- LICENSE
|
|
33
|
+
- README.md
|
|
34
|
+
- Rakefile
|
|
35
|
+
- lib/minilex.rb
|
|
36
|
+
- spec/lexer_spec.rb
|
|
37
|
+
homepage: http://github.com/satchmorun/minilex
|
|
38
|
+
licenses:
|
|
39
|
+
- MIT
|
|
40
|
+
post_install_message:
|
|
41
|
+
rdoc_options: []
|
|
42
|
+
require_paths:
|
|
43
|
+
- lib
|
|
44
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
45
|
+
none: false
|
|
46
|
+
requirements:
|
|
47
|
+
- - ! '>='
|
|
48
|
+
- !ruby/object:Gem::Version
|
|
49
|
+
version: '0'
|
|
50
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
51
|
+
none: false
|
|
52
|
+
requirements:
|
|
53
|
+
- - ! '>='
|
|
54
|
+
- !ruby/object:Gem::Version
|
|
55
|
+
version: '0'
|
|
56
|
+
requirements: []
|
|
57
|
+
rubyforge_project:
|
|
58
|
+
rubygems_version: 1.8.10
|
|
59
|
+
signing_key:
|
|
60
|
+
specification_version: 3
|
|
61
|
+
summary: A little lexer toolkit.
|
|
62
|
+
test_files:
|
|
63
|
+
- spec/lexer_spec.rb
|