skeem 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +3 -0
- data/lib/skeem/tokenizer.rb +24 -6
- data/lib/skeem/version.rb +1 -1
- data/spec/skeem/tokenizer_spec.rb +124 -2
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a4521e994b28e81e3fa1f8852bf096c2e375def5
|
4
|
+
data.tar.gz: 86e7e0738987b88b9043740632b510e5bee6a49f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0197b9df68cba199a243b53d22f329728575ec2c8021b813f6f3eb4be4e39d530cfc53fa221be7994a636da06fdc1ad17977985d7d4e0c54dd3f4dda1c8b6cae
|
7
|
+
data.tar.gz: 493fb5dd6786dda700abf79ea211d96b832772540986b36eca887ddb2abba4816954080fb5508fdc55dc9ab4b375b5b129f031f507efcf9caf081e05b28ba10f
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,8 @@
|
|
1
|
+
## [0.0.2] - 2018-08-25
|
2
|
+
### Changed
|
3
|
+
- Class `Tokenizer` improved: it now recognizes delimiters, booleans, integers, real numbers, strings, and identifiers.
|
4
|
+
- Spec file `tokenizer_spec.rb` expanded with more tests.
|
5
|
+
|
1
6
|
## [0.0.1] - 2018-08-25
|
2
7
|
### Added
|
3
8
|
- Initial `Tokenizer` class commit
|
data/README.md
CHANGED
@@ -31,6 +31,9 @@ Roadmap:
|
|
31
31
|
- Make it pass all examples from the [Reasoned Schemer](https://mitpress.mit.edu/books/reasoned-schemer-second-edition) book.
|
32
32
|
|
33
33
|
TODO: Write usage instructions here
|
34
|
+
|
35
|
+
Good to know:
|
36
|
+
Online book: [The Scheme Programming Language (4th Ed.)](https://www.scheme.com/tspl4/)
|
34
37
|
|
35
38
|
## Development
|
36
39
|
|
data/lib/skeem/tokenizer.rb
CHANGED
@@ -30,7 +30,14 @@ module Skeem
|
|
30
30
|
# Constructor. Initialize a tokenizer for Skeem.
|
31
31
|
# @param source [String] Skeem text to tokenize.
|
32
32
|
def initialize(source)
|
33
|
-
@scanner = StringScanner.new(
|
33
|
+
@scanner = StringScanner.new('')
|
34
|
+
reinitialize(source)
|
35
|
+
end
|
36
|
+
|
37
|
+
|
38
|
+
# @param source [String] Skeem text to tokenize.
|
39
|
+
def reinitialize(source)
|
40
|
+
@scanner.string = source
|
34
41
|
@lineno = 1
|
35
42
|
@line_start = 0
|
36
43
|
end
|
@@ -58,19 +65,30 @@ module Skeem
|
|
58
65
|
if "()'`".include? curr_ch
|
59
66
|
# Delimiters, separators => single character token
|
60
67
|
token = build_token(@@lexeme2name[curr_ch], scanner.getch)
|
61
|
-
elsif (lexeme = scanner.scan(/#(
|
68
|
+
elsif (lexeme = scanner.scan(/#(?:\.)(?=\s|[|()";]|$)/)) # Single char occurring alone
|
69
|
+
token = build_token('PERIOD', lexeme)
|
70
|
+
elsif (lexeme = scanner.scan(/#(?:t|f|true|false)(?=\s|[|()";]|$)/))
|
62
71
|
token = build_token('BOOLEAN', lexeme) # normalized lexeme
|
63
|
-
elsif (lexeme = scanner.scan(/[0-9]+(
|
72
|
+
elsif (lexeme = scanner.scan(/[+-]?[0-9]+(?=\s|[|()";]|$)/))
|
64
73
|
token = build_token('INTEGER', lexeme) # Decimal radix
|
65
|
-
elsif (lexeme = scanner.scan(
|
74
|
+
elsif (lexeme = scanner.scan(/[+-]?[0-9]+\.[0-9]+(?:(?:e|E)[+-]?[0-9]+)?/))
|
66
75
|
token = build_token('REAL', lexeme)
|
67
76
|
elsif (lexeme = scanner.scan(/"(?:\\"|[^"])*"/)) # Double quotes literal?
|
68
77
|
unquoted = lexeme.gsub(/(^")|("$)/, '')
|
69
78
|
token = build_token('STRING_LIT', unquoted)
|
70
|
-
elsif (lexeme = scanner.scan(/([\+\-])((?=\s|[|()";])|$)/))
|
71
|
-
token = build_token('IDENTIFIER', lexeme) # Plus and minus as identifiers
|
72
79
|
elsif (lexeme = scanner.scan(/[a-zA-Z!$%&*\/:<=>?@^_~][a-zA-Z0-9!$%&*+-.\/:<=>?@^_~+-]*/))
|
73
80
|
token = build_token('IDENTIFIER', lexeme)
|
81
|
+
elsif (lexeme = scanner.scan(/\|(?:[^|])*\|/)) # Vertical bar delimited
|
82
|
+
token = build_token('IDENTIFIER', lexeme)
|
83
|
+
elsif (lexeme = scanner.scan(/([\+\-])((?=\s|[|()";])|$)/))
|
84
|
+
# # R7RS peculiar identifiers case 1: isolated plus and minus as identifiers
|
85
|
+
token = build_token('IDENTIFIER', lexeme)
|
86
|
+
elsif (lexeme = scanner.scan(/[+-][a-zA-Z!$%&*\/:<=>?@^_~+-@][a-zA-Z0-9!$%&*+-.\/:<=>?@^_~+-]*/))
|
87
|
+
# R7RS peculiar identifiers case 2
|
88
|
+
token = build_token('IDENTIFIER', lexeme)
|
89
|
+
elsif (lexeme = scanner.scan(/\.[a-zA-Z!$%&*\/:<=>?@^_~+-@.][a-zA-Z0-9!$%&*+-.\/:<=>?@^_~+-]*/))
|
90
|
+
# R7RS peculiar identifiers case 4
|
91
|
+
token = build_token('IDENTIFIER', lexeme)
|
74
92
|
else # Unknown token
|
75
93
|
erroneous = curr_ch.nil? ? '' : scanner.scan(/./)
|
76
94
|
sequel = scanner.scan(/.{1,20}/)
|
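data/lib/skeem/version.rb
CHANGED
[The one-line hunk for this file (+1 -1 in the file list at the top) is missing from this extract.]
data/spec/skeem/tokenizer_spec.rb
CHANGED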
@@ -10,7 +10,12 @@ module Skeem
|
|
10
10
|
expect(token.lexeme).to eq(lexeme)
|
11
11
|
end
|
12
12
|
end
|
13
|
+
|
14
|
+
def unquoted(aString)
|
15
|
+
aString.gsub(/(^")|("$)/, '')
|
16
|
+
end
|
13
17
|
|
18
|
+
# Default instantiation
|
14
19
|
subject { Tokenizer.new('') }
|
15
20
|
|
16
21
|
context 'Initialization:' do
|
@@ -21,10 +26,11 @@ module Skeem
|
|
21
26
|
it 'should have its scanner initialized' do
|
22
27
|
expect(subject.scanner).to be_kind_of(StringScanner)
|
23
28
|
end
|
24
|
-
|
29
|
+
end # context
|
30
|
+
|
25
31
|
context 'Delimiter and separator token recognition:' do
|
26
32
|
it 'should tokenize single char delimiters' do
|
27
|
-
subject.
|
33
|
+
subject.reinitialize("( ) ' `")
|
28
34
|
tokens = subject.tokens
|
29
35
|
tokens.each { |token| expect(token).to be_kind_of(SToken) }
|
30
36
|
terminals = tokens.map(&:terminal)
|
@@ -32,6 +38,122 @@ module Skeem
|
|
32
38
|
expect(terminals).to eq(prediction)
|
33
39
|
end
|
34
40
|
end # context
|
41
|
+
|
42
|
+
context 'Boolean literals recognition:' do
|
43
|
+
it 'should tokenize boolean constants' do
|
44
|
+
tests = [
|
45
|
+
# couple [raw input, expected]
|
46
|
+
['#t', '#t'],
|
47
|
+
[' #f', '#f'],
|
48
|
+
['#true ', '#true'],
|
49
|
+
[' #false', '#false']
|
50
|
+
]
|
51
|
+
|
52
|
+
tests.each do |(input, prediction)|
|
53
|
+
subject.reinitialize(input)
|
54
|
+
token = subject.tokens.first
|
55
|
+
expect(token.terminal).to eq('BOOLEAN')
|
56
|
+
expect(token.lexeme).to eq(prediction)
|
57
|
+
end
|
58
|
+
end
|
59
|
+
end # context
|
60
|
+
|
61
|
+
context 'Integer literals recognition:' do
|
62
|
+
it 'should tokenize integers in default radix 10' do
|
63
|
+
tests = [
|
64
|
+
# couple [raw input, expected]
|
65
|
+
['0', '0'],
|
66
|
+
[' 3', '3'],
|
67
|
+
['+3 ', '+3'],
|
68
|
+
['-3', '-3'],
|
69
|
+
['-1234', '-1234']
|
70
|
+
]
|
71
|
+
|
72
|
+
tests.each do |(input, prediction)|
|
73
|
+
subject.reinitialize(input)
|
74
|
+
token = subject.tokens.first
|
75
|
+
expect(token.terminal).to eq('INTEGER')
|
76
|
+
expect(token.lexeme).to eq(prediction)
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end # context
|
80
|
+
|
81
|
+
context 'Real number recognition:' do
|
82
|
+
it 'should tokenize real numbers' do
|
83
|
+
tests = [
|
84
|
+
# couple [raw input, expected]
|
85
|
+
["\t\t3.45e+6", '3.45e+6'],
|
86
|
+
['+3.45e+6', '+3.45e+6'],
|
87
|
+
['-3.45e+6', '-3.45e+6']
|
88
|
+
]
|
89
|
+
|
90
|
+
tests.each do |(input, prediction)|
|
91
|
+
subject.reinitialize(input)
|
92
|
+
token = subject.tokens.first
|
93
|
+
expect(token.terminal).to eq('REAL')
|
94
|
+
expect(token.lexeme).to eq(prediction)
|
95
|
+
end
|
96
|
+
end
|
97
|
+
end # context
|
98
|
+
|
99
|
+
context 'String recognition:' do
|
100
|
+
it 'should tokenize strings' do
|
101
|
+
examples = [
|
102
|
+
# Some examples taken from R7RS document
|
103
|
+
'"Hello world!"',
|
104
|
+
'"The word \"recursion\" has many meanings."'
|
105
|
+
]
|
106
|
+
|
107
|
+
examples.each do |input|
|
108
|
+
# puts input
|
109
|
+
subject.reinitialize(input)
|
110
|
+
token = subject.tokens.first
|
111
|
+
expect(token.terminal).to eq('STRING_LIT')
|
112
|
+
expect(token.lexeme).to eq(unquoted(input))
|
113
|
+
end
|
114
|
+
end
|
115
|
+
end # context
|
116
|
+
|
117
|
+
=begin
|
118
|
+
For later:
|
119
|
+
"Another example:\ntwo lines of text"
|
120
|
+
"Here's text \
|
121
|
+
containing just one line"
|
122
|
+
"\x03B1; is named GREEK SMALL LETTER ALPHA."
|
123
|
+
=end
|
124
|
+
|
125
|
+
context 'Identifier recognition:' do
|
126
|
+
it 'should tokenize identifier' do
|
127
|
+
examples = [
|
128
|
+
# Examples taken from R7RS document
|
129
|
+
'...', '+', '+soup+', '<=?',
|
130
|
+
'->string', 'a34kTMNs', 'lambda',
|
131
|
+
'list->vector', 'q', 'V17a',
|
132
|
+
'|two words|', '|two\x20;words|',
|
133
|
+
'the-word-recursion-has-many-meanings'
|
134
|
+
]
|
135
|
+
|
136
|
+
examples.each do |input|
|
137
|
+
subject.reinitialize(input)
|
138
|
+
token = subject.tokens.first
|
139
|
+
expect(token.terminal).to eq('IDENTIFIER')
|
140
|
+
expect(token.lexeme).to eq(input)
|
141
|
+
end
|
142
|
+
end
|
143
|
+
end # context
|
144
|
+
|
145
|
+
context 'Scanning Scheme sample code' do
|
146
|
+
it 'should read examples from lis.py page' do
|
147
|
+
source = <<-SCHEME
|
148
|
+
(if (> (val x) 0)
|
149
|
+
(fn (+ (aref A i) (* 3 i))
|
150
|
+
(quote (one two)))
|
151
|
+
end
|
152
|
+
end
|
153
|
+
SCHEME
|
154
|
+
subject.reinitialize(source)
|
155
|
+
expect { subject.tokens }.not_to raise_error
|
156
|
+
end
|
35
157
|
end # context
|
36
158
|
end # describe
|
37
159
|
end # module
|