skeem 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +3 -0
- data/lib/skeem/tokenizer.rb +24 -6
- data/lib/skeem/version.rb +1 -1
- data/spec/skeem/tokenizer_spec.rb +124 -2
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a4521e994b28e81e3fa1f8852bf096c2e375def5
|
4
|
+
data.tar.gz: 86e7e0738987b88b9043740632b510e5bee6a49f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0197b9df68cba199a243b53d22f329728575ec2c8021b813f6f3eb4be4e39d530cfc53fa221be7994a636da06fdc1ad17977985d7d4e0c54dd3f4dda1c8b6cae
|
7
|
+
data.tar.gz: 493fb5dd6786dda700abf79ea211d96b832772540986b36eca887ddb2abba4816954080fb5508fdc55dc9ab4b375b5b129f031f507efcf9caf081e05b28ba10f
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,8 @@
|
|
1
|
+
## [0.0.2] - 2018-08-25
|
2
|
+
### Changed
|
3
|
+
- Class `Tokenizer` improved; it now recognizes delimiters, booleans, integers, real numbers, strings, and identifiers.
|
4
|
+
- Spec file `Tokenizer_spec.rb` expanded with more tests.
|
5
|
+
|
1
6
|
## [0.0.1] - 2018-08-25
|
2
7
|
### Added
|
3
8
|
- Initial `Tokenizer` class commit
|
data/README.md
CHANGED
@@ -31,6 +31,9 @@ Roadmap:
|
|
31
31
|
- Make it pass all examples from the [Reasoned Schemer](https://mitpress.mit.edu/books/reasoned-schemer-second-edition) book.
|
32
32
|
|
33
33
|
TODO: Write usage instructions here
|
34
|
+
|
35
|
+
Good to know:
|
36
|
+
Online book: [The Scheme Programming Language (4th Ed.)](https://www.scheme.com/tspl4/)
|
34
37
|
|
35
38
|
## Development
|
36
39
|
|
data/lib/skeem/tokenizer.rb
CHANGED
@@ -30,7 +30,14 @@ module Skeem
|
|
30
30
|
# Constructor. Initialize a tokenizer for Skeem.
|
31
31
|
# @param source [String] Skeem text to tokenize.
|
32
32
|
def initialize(source)
|
33
|
-
@scanner = StringScanner.new(
|
33
|
+
@scanner = StringScanner.new('')
|
34
|
+
reinitialize(source)
|
35
|
+
end
|
36
|
+
|
37
|
+
|
38
|
+
# @param source [String] Skeem text to tokenize.
|
39
|
+
def reinitialize(source)
|
40
|
+
@scanner.string = source
|
34
41
|
@lineno = 1
|
35
42
|
@line_start = 0
|
36
43
|
end
|
@@ -58,19 +65,30 @@ module Skeem
|
|
58
65
|
if "()'`".include? curr_ch
|
59
66
|
# Delimiters, separators => single character token
|
60
67
|
token = build_token(@@lexeme2name[curr_ch], scanner.getch)
|
61
|
-
elsif (lexeme = scanner.scan(/#(
|
68
|
+
elsif (lexeme = scanner.scan(/#(?:\.)(?=\s|[|()";]|$)/)) # Single char occurring alone
|
69
|
+
token = build_token('PERIOD', lexeme)
|
70
|
+
elsif (lexeme = scanner.scan(/#(?:t|f|true|false)(?=\s|[|()";]|$)/))
|
62
71
|
token = build_token('BOOLEAN', lexeme) # normalized lexeme
|
63
|
-
elsif (lexeme = scanner.scan(/[0-9]+(
|
72
|
+
elsif (lexeme = scanner.scan(/[+-]?[0-9]+(?=\s|[|()";]|$)/))
|
64
73
|
token = build_token('INTEGER', lexeme) # Decimal radix
|
65
|
-
elsif (lexeme = scanner.scan(
|
74
|
+
elsif (lexeme = scanner.scan(/[+-]?[0-9]+\.[0-9]+(?:(?:e|E)[+-]?[0-9]+)?/))
|
66
75
|
token = build_token('REAL', lexeme)
|
67
76
|
elsif (lexeme = scanner.scan(/"(?:\\"|[^"])*"/)) # Double quotes literal?
|
68
77
|
unquoted = lexeme.gsub(/(^")|("$)/, '')
|
69
78
|
token = build_token('STRING_LIT', unquoted)
|
70
|
-
elsif (lexeme = scanner.scan(/([\+\-])((?=\s|[|()";])|$)/))
|
71
|
-
token = build_token('IDENTIFIER', lexeme) # Plus and minus as identifiers
|
72
79
|
elsif (lexeme = scanner.scan(/[a-zA-Z!$%&*\/:<=>?@^_~][a-zA-Z0-9!$%&*+-.\/:<=>?@^_~+-]*/))
|
73
80
|
token = build_token('IDENTIFIER', lexeme)
|
81
|
+
elsif (lexeme = scanner.scan(/\|(?:[^|])*\|/)) # Vertical bar delimited
|
82
|
+
token = build_token('IDENTIFIER', lexeme)
|
83
|
+
elsif (lexeme = scanner.scan(/([\+\-])((?=\s|[|()";])|$)/))
|
84
|
+
# # R7RS peculiar identifiers case 1: isolated plus and minus as identifiers
|
85
|
+
token = build_token('IDENTIFIER', lexeme)
|
86
|
+
elsif (lexeme = scanner.scan(/[+-][a-zA-Z!$%&*\/:<=>?@^_~+-@][a-zA-Z0-9!$%&*+-.\/:<=>?@^_~+-]*/))
|
87
|
+
# R7RS peculiar identifiers case 2
|
88
|
+
token = build_token('IDENTIFIER', lexeme)
|
89
|
+
elsif (lexeme = scanner.scan(/\.[a-zA-Z!$%&*\/:<=>?@^_~+-@.][a-zA-Z0-9!$%&*+-.\/:<=>?@^_~+-]*/))
|
90
|
+
# R7RS peculiar identifiers case 4
|
91
|
+
token = build_token('IDENTIFIER', lexeme)
|
74
92
|
else # Unknown token
|
75
93
|
erroneous = curr_ch.nil? ? '' : scanner.scan(/./)
|
76
94
|
sequel = scanner.scan(/.{1,20}/)
|
data/lib/skeem/version.rb
CHANGED
@@ -10,7 +10,12 @@ module Skeem
|
|
10
10
|
expect(token.lexeme).to eq(lexeme)
|
11
11
|
end
|
12
12
|
end
|
13
|
+
|
14
|
+
def unquoted(aString)
|
15
|
+
aString.gsub(/(^")|("$)/, '')
|
16
|
+
end
|
13
17
|
|
18
|
+
# Default instantiation
|
14
19
|
subject { Tokenizer.new('') }
|
15
20
|
|
16
21
|
context 'Initialization:' do
|
@@ -21,10 +26,11 @@ module Skeem
|
|
21
26
|
it 'should have its scanner initialized' do
|
22
27
|
expect(subject.scanner).to be_kind_of(StringScanner)
|
23
28
|
end
|
24
|
-
|
29
|
+
end # context
|
30
|
+
|
25
31
|
context 'Delimiter and separator token recognition:' do
|
26
32
|
it 'should tokenize single char delimiters' do
|
27
|
-
subject.
|
33
|
+
subject.reinitialize("( ) ' `")
|
28
34
|
tokens = subject.tokens
|
29
35
|
tokens.each { |token| expect(token).to be_kind_of(SToken) }
|
30
36
|
terminals = tokens.map(&:terminal)
|
@@ -32,6 +38,122 @@ module Skeem
|
|
32
38
|
expect(terminals).to eq(prediction)
|
33
39
|
end
|
34
40
|
end # context
|
41
|
+
|
42
|
+
context 'Boolean literals recognition:' do
|
43
|
+
it 'should tokenize boolean constants' do
|
44
|
+
tests = [
|
45
|
+
# couple [raw input, expected]
|
46
|
+
['#t', '#t'],
|
47
|
+
[' #f', '#f'],
|
48
|
+
['#true ', '#true'],
|
49
|
+
[' #false', '#false']
|
50
|
+
]
|
51
|
+
|
52
|
+
tests.each do |(input, prediction)|
|
53
|
+
subject.reinitialize(input)
|
54
|
+
token = subject.tokens.first
|
55
|
+
expect(token.terminal).to eq('BOOLEAN')
|
56
|
+
expect(token.lexeme).to eq(prediction)
|
57
|
+
end
|
58
|
+
end
|
59
|
+
end # context
|
60
|
+
|
61
|
+
context 'Integer literals recognition:' do
|
62
|
+
it 'should tokenize integers in default radix 10' do
|
63
|
+
tests = [
|
64
|
+
# couple [raw input, expected]
|
65
|
+
['0', '0'],
|
66
|
+
[' 3', '3'],
|
67
|
+
['+3 ', '+3'],
|
68
|
+
['-3', '-3'],
|
69
|
+
['-1234', '-1234']
|
70
|
+
]
|
71
|
+
|
72
|
+
tests.each do |(input, prediction)|
|
73
|
+
subject.reinitialize(input)
|
74
|
+
token = subject.tokens.first
|
75
|
+
expect(token.terminal).to eq('INTEGER')
|
76
|
+
expect(token.lexeme).to eq(prediction)
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end # context
|
80
|
+
|
81
|
+
context 'Real number recognition:' do
|
82
|
+
it 'should tokenize real numbers' do
|
83
|
+
tests = [
|
84
|
+
# couple [raw input, expected]
|
85
|
+
["\t\t3.45e+6", '3.45e+6'],
|
86
|
+
['+3.45e+6', '+3.45e+6'],
|
87
|
+
['-3.45e+6', '-3.45e+6']
|
88
|
+
]
|
89
|
+
|
90
|
+
tests.each do |(input, prediction)|
|
91
|
+
subject.reinitialize(input)
|
92
|
+
token = subject.tokens.first
|
93
|
+
expect(token.terminal).to eq('REAL')
|
94
|
+
expect(token.lexeme).to eq(prediction)
|
95
|
+
end
|
96
|
+
end
|
97
|
+
end # context
|
98
|
+
|
99
|
+
context 'String recognition:' do
|
100
|
+
it 'should tokenize strings' do
|
101
|
+
examples = [
|
102
|
+
# Some examples taken from R7RS document
|
103
|
+
'"Hello world!"',
|
104
|
+
'"The word \"recursion\" has many meanings."'
|
105
|
+
]
|
106
|
+
|
107
|
+
examples.each do |input|
|
108
|
+
# puts input
|
109
|
+
subject.reinitialize(input)
|
110
|
+
token = subject.tokens.first
|
111
|
+
expect(token.terminal).to eq('STRING_LIT')
|
112
|
+
expect(token.lexeme).to eq(unquoted(input))
|
113
|
+
end
|
114
|
+
end
|
115
|
+
end # context
|
116
|
+
|
117
|
+
=begin
|
118
|
+
For later:
|
119
|
+
"Another example:\ntwo lines of text"
|
120
|
+
"Here's text \
|
121
|
+
containing just one line"
|
122
|
+
"\x03B1; is named GREEK SMALL LETTER ALPHA."
|
123
|
+
=end
|
124
|
+
|
125
|
+
context 'Identifier recognition:' do
|
126
|
+
it 'should tokenize identifier' do
|
127
|
+
examples = [
|
128
|
+
# Examples taken from R7RS document
|
129
|
+
'...', '+', '+soup+', '<=?',
|
130
|
+
'->string', 'a34kTMNs', 'lambda',
|
131
|
+
'list->vector', 'q', 'V17a',
|
132
|
+
'|two words|', '|two\x20;words|',
|
133
|
+
'the-word-recursion-has-many-meanings'
|
134
|
+
]
|
135
|
+
|
136
|
+
examples.each do |input|
|
137
|
+
subject.reinitialize(input)
|
138
|
+
token = subject.tokens.first
|
139
|
+
expect(token.terminal).to eq('IDENTIFIER')
|
140
|
+
expect(token.lexeme).to eq(input)
|
141
|
+
end
|
142
|
+
end
|
143
|
+
end # context
|
144
|
+
|
145
|
+
context 'Scanning Scheme sample code' do
|
146
|
+
it 'should read examples from lis.py page' do
|
147
|
+
source = <<-SCHEME
|
148
|
+
(if (> (val x) 0)
|
149
|
+
(fn (+ (aref A i) (* 3 i))
|
150
|
+
(quote (one two)))
|
151
|
+
end
|
152
|
+
end
|
153
|
+
SCHEME
|
154
|
+
subject.reinitialize(source)
|
155
|
+
expect { subject.tokens }.not_to raise_error
|
156
|
+
end
|
35
157
|
end # context
|
36
158
|
end # describe
|
37
159
|
end # module
|