scanner 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +131 -4
- data/lib/scanner/scanner.rb +29 -14
- data/lib/scanner/version.rb +1 -1
- data/spec/scanner/scanner_spec.rb +15 -0
- metadata +4 -4
data/README.md
CHANGED
@@ -24,19 +24,146 @@ Scanner is a module that you can include in your classes. It defines a
|
|
24
24
|
token function that accepts the regular expression that the token
|
25
25
|
matches.
|
26
26
|
|
27
|
-
|
27
|
+
For example
|
28
28
|
|
29
29
|
class TestScanner
|
30
30
|
include Scanner
|
31
|
-
ignore
|
32
|
-
token :number,
|
33
|
-
token :id,
|
31
|
+
ignore '\s+'
|
32
|
+
token :number, '\d+'
|
33
|
+
token :id, '[a-z]+'
|
34
34
|
end
|
35
35
|
|
36
36
|
@scanner = TestScanner.new
|
37
37
|
@scanner.parse("123")
|
38
38
|
@scanner.look_ahead.is?(:number) # Should be true
|
39
39
|
|
40
|
+
### Token definition
|
41
|
+
Each token is defined by a symbol, used to identify the token, and a
|
42
|
+
regular expression that the token should match. An optional third
|
43
|
+
parameter accepts a hash of options that we will explore later. For
|
44
|
+
example
|
45
|
+
|
46
|
+
token :number, '\d+'
|
47
|
+
|
48
|
+
will match strings containing digits.
|
49
|
+
|
50
|
+
Some care is needed when defining tokens that collide with other
|
51
|
+
tokens. For instance, a language may define the token '==' and the
|
52
|
+
token '='. You need to define the double equals before the single
|
53
|
+
equals, otherwise the string '==' will be identified as two '=' tokens,
|
54
|
+
instead of a '==' token.
|
55
|
+
|
56
|
+
### Ignoring characters
|
57
|
+
For many scanning needs, there is a set of characters that is safely
|
58
|
+
ignored, for instance, in many programming languages, spaces and
|
59
|
+
newlines. You can define the set of characters to ignore with the
|
60
|
+
following definition:
|
61
|
+
|
62
|
+
ignore '[\s|\n]+'
|
63
|
+
|
64
|
+
### Defining keywords
|
65
|
+
For many scanning needs, there is a set of tokens that define the
|
66
|
+
reserved words or keywords of a language. For instance, in Ruby, the
|
67
|
+
tokens 'def', 'class', 'module', and so on, are language reserved words.
|
68
|
+
Usually, these tokens are a subset of a larger token group, called
|
69
|
+
identifiers or ids. You can define a family of reserved words by using
|
70
|
+
the 'keywords' function.
|
71
|
+
|
72
|
+
ignore '[\s|\n]+'
|
73
|
+
token :id, '[a-z]+'
|
74
|
+
keywords %w{def class module}
|
75
|
+
|
76
|
+
@scanner.parse("other def")
|
77
|
+
@scanner.lookahead.is?(:id)
|
78
|
+
@scanner.lookahead(2).is?(:def)
|
79
|
+
|
80
|
+
Note that you will need to have a token definition that matches those
|
81
|
+
keywords, as the token :id in the previous example.
|
82
|
+
|
83
|
+
### Consuming tokens and looking ahead
|
84
|
+
The Scanner method consume will try to match the first token remaining
|
85
|
+
in the input string. If successful, it will return the token, and remove
|
86
|
+
it from the input string.
|
87
|
+
|
88
|
+
ignore '[\s|\n]+'
|
89
|
+
token :id, '[a-z]+'
|
90
|
+
|
91
|
+
@scanner.parse("one two")
|
92
|
+
@scanner.consume.content == "one"
|
93
|
+
@scanner.consume.content == "two"
|
94
|
+
|
95
|
+
Lookahead performs a similar function, but without removing the token
|
96
|
+
from the string. It accepts an optional parameter indicating the number
|
97
|
+
of tokens to look ahead.
|
98
|
+
|
99
|
+
@scanner.parse("one two")
|
100
|
+
@scanner.lookahead.content == "one"
|
101
|
+
@scanner.lookahead(2).content == "two"
|
102
|
+
|
103
|
+
### End of file
|
104
|
+
|
105
|
+
ignore '\s+'
|
106
|
+
token :number, '\d+'
|
107
|
+
token :id, '[a-z]+'
|
108
|
+
|
109
|
+
@scanner = TestScanner.new
|
110
|
+
@scanner.parse("123 abc 456 other")
|
111
|
+
begin
|
112
|
+
token = @scanner.consume
|
113
|
+
puts token.content
|
114
|
+
end while token.is_not? :eof
|
115
|
+
|
116
|
+
You know you have reached the end of the parse string when you receive
|
117
|
+
the :eof token. For instance
|
118
|
+
|
119
|
+
### Looping through tokens
|
120
|
+
A scanner instance is a ruby Enumerable, so you can use each, map, and
|
121
|
+
others.
|
122
|
+
|
123
|
+
@scanner.parse("123 456")
|
124
|
+
@scanner.map { |tok| "-#{tok.content}-" }
|
125
|
+
|
126
|
+
### Token separation
|
127
|
+
Sometimes it is necessary to indicate that a given token needs to be
|
128
|
+
followed by a token separator. For instance, in this example
|
129
|
+
|
130
|
+
token :number, '\d+'
|
131
|
+
token :id, '[a-z]+'
|
132
|
+
|
133
|
+
The string "abc123" will be parsed as an :id followed by a :number,
|
134
|
+
which may be undesirable. You may want to indicate that a token
|
135
|
+
separator (commonly spaces, arithmetic operators, punctuation marks,
|
136
|
+
etc) needs to occur after :id or :number.
|
137
|
+
|
138
|
+
The following code requires a space after ids and numbers:
|
139
|
+
|
140
|
+
token :number, '\d+', check_for_token_separator: true
|
141
|
+
token :id, '[a-z]+', check_for_token_separator: true
|
142
|
+
token_separator '\s'
|
143
|
+
|
144
|
+
### Looking ahead for token types
|
145
|
+
When scanning strings, it is often necessary to lookahead to check what
|
146
|
+
types of tokens are coming. For instance:
|
147
|
+
|
148
|
+
if @scanner.lookahead.is?(:id) && @scanner.lookahead(2).is?(:equal)
|
149
|
+
# variable assignment
|
150
|
+
|
151
|
+
Scanner provides a few utility functions to make this type of check
|
152
|
+
easier. For instance, the previous code could be refactored to:
|
153
|
+
|
154
|
+
if @scanner.tokens_are?(:id, :equal)
|
155
|
+
|
156
|
+
The other two methods available are token_is? and token_is_not?.
|
157
|
+
|
158
|
+
### Tokens
|
159
|
+
The tokens returned by consume and lookahead have a few methods, which
|
160
|
+
should be self-explanatory:
|
161
|
+
|
162
|
+
* content
|
163
|
+
* line
|
164
|
+
* column
|
165
|
+
* is? => Checks that the token is of a given type
|
166
|
+
* is_not? => The opposite
|
40
167
|
|
41
168
|
## Contributing
|
42
169
|
|
data/lib/scanner/scanner.rb
CHANGED
@@ -50,38 +50,38 @@ module Scanner
|
|
50
50
|
|
51
51
|
def check_for_token_separator
|
52
52
|
self.class.instance_eval { @check_for_token_separator }
|
53
|
-
end
|
53
|
+
end
|
54
54
|
|
55
55
|
def separator
|
56
56
|
self.class.instance_eval { @separator }
|
57
|
-
end
|
57
|
+
end
|
58
58
|
|
59
59
|
public
|
60
60
|
|
61
|
+
include Enumerable
|
62
|
+
|
61
63
|
def parse(program)
|
62
64
|
@program = program
|
63
65
|
@token_list = []
|
64
66
|
@line_number = 1
|
65
67
|
@column_number = 1
|
68
|
+
@token_number = 0
|
66
69
|
end
|
67
70
|
|
68
71
|
def consume
|
69
|
-
if @token_list.
|
72
|
+
if @token_number >= @token_list.size
|
70
73
|
consume_next_token
|
71
|
-
else
|
72
|
-
@token_list.shift
|
73
74
|
end
|
75
|
+
token = @token_list[@token_number]
|
76
|
+
@token_number+=1
|
77
|
+
token
|
74
78
|
end
|
75
79
|
|
76
80
|
def look_ahead(number_of_tokens = 1)
|
77
|
-
|
78
|
-
|
79
|
-
throw :scanner_exception if end_of_file_met
|
80
|
-
token = consume_next_token
|
81
|
-
@token_list << token
|
82
|
-
end_of_file_met = token.is? :eof
|
81
|
+
while @token_list.size < @token_number + number_of_tokens
|
82
|
+
consume_next_token
|
83
83
|
end
|
84
|
-
@token_list[-1]
|
84
|
+
@token_list[@token_number + number_of_tokens - 1]
|
85
85
|
end
|
86
86
|
|
87
87
|
def token_is?(token_type)
|
@@ -101,6 +101,20 @@ module Scanner
|
|
101
101
|
return true
|
102
102
|
end
|
103
103
|
|
104
|
+
def each
|
105
|
+
local_index = 0
|
106
|
+
begin
|
107
|
+
if local_index >= @token_list.size
|
108
|
+
consume_next_token
|
109
|
+
end
|
110
|
+
current_token = @token_list[local_index]
|
111
|
+
if current_token.is_not? :eof
|
112
|
+
yield current_token
|
113
|
+
end
|
114
|
+
local_index += 1
|
115
|
+
end while current_token.is_not? :eof
|
116
|
+
end
|
117
|
+
|
104
118
|
private
|
105
119
|
|
106
120
|
|
@@ -114,7 +128,8 @@ module Scanner
|
|
114
128
|
if check_for_token_separator[symbol]
|
115
129
|
check_for_separator
|
116
130
|
end
|
117
|
-
|
131
|
+
@token_list << Token.new(token_type, content, @line_number, currently_at_column)
|
132
|
+
return
|
118
133
|
end
|
119
134
|
end
|
120
135
|
|
@@ -128,7 +143,7 @@ module Scanner
|
|
128
143
|
|
129
144
|
def get_token_from_reg_exp(reg_exp, symbol)
|
130
145
|
content = consume_regular_expression(reg_exp)
|
131
|
-
if keywords.include?
|
146
|
+
if keywords && keywords.include?(content)
|
132
147
|
token_type = content.to_sym
|
133
148
|
else
|
134
149
|
token_type = symbol
|
data/lib/scanner/version.rb
CHANGED
@@ -35,6 +35,21 @@ describe Scanner do
|
|
35
35
|
end
|
36
36
|
end
|
37
37
|
|
38
|
+
describe "has enumerable functions" do
|
39
|
+
it "has each" do
|
40
|
+
@scanner.parse("123 456")
|
41
|
+
@scanner.each do |tok|
|
42
|
+
tok.content.should match /123|456/
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
it "has map" do
|
47
|
+
@scanner.parse("123 456")
|
48
|
+
map_results = @scanner.map { |tok| "-#{tok.content}-" }
|
49
|
+
map_results.should eq ["-123-","-456-"]
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
38
53
|
describe "lookahead" do
|
39
54
|
it "returns the next token without arguments" do
|
40
55
|
@scanner.parse("123")
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scanner
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-08-
|
12
|
+
date: 2012-08-09 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
@@ -77,7 +77,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
77
77
|
version: '0'
|
78
78
|
segments:
|
79
79
|
- 0
|
80
|
-
hash:
|
80
|
+
hash: -2266866493885490648
|
81
81
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
82
82
|
none: false
|
83
83
|
requirements:
|
@@ -86,7 +86,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
86
86
|
version: '0'
|
87
87
|
segments:
|
88
88
|
- 0
|
89
|
-
hash:
|
89
|
+
hash: -2266866493885490648
|
90
90
|
requirements: []
|
91
91
|
rubyforge_project:
|
92
92
|
rubygems_version: 1.8.24
|