rucc 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +55 -0
  3. data/.rspec +1 -0
  4. data/.travis.yml +5 -0
  5. data/Gemfile +6 -0
  6. data/Gemfile.lock +46 -0
  7. data/LICENCE +21 -0
  8. data/README.md +82 -0
  9. data/Rakefile +2 -0
  10. data/Vagrantfile +10 -0
  11. data/bin/console +10 -0
  12. data/bin/rspec +2 -0
  13. data/bin/setup +8 -0
  14. data/exe/rucc +7 -0
  15. data/include/8cc.h +48 -0
  16. data/include/float.h +44 -0
  17. data/include/iso646.h +20 -0
  18. data/include/rucc.h +2 -0
  19. data/include/stdalign.h +11 -0
  20. data/include/stdarg.h +52 -0
  21. data/include/stdbool.h +11 -0
  22. data/include/stddef.h +15 -0
  23. data/include/stdnoreturn.h +8 -0
  24. data/lib/rucc.rb +8 -0
  25. data/lib/rucc/case.rb +22 -0
  26. data/lib/rucc/decl.rb +9 -0
  27. data/lib/rucc/enc.rb +9 -0
  28. data/lib/rucc/engine.rb +138 -0
  29. data/lib/rucc/file_io.rb +108 -0
  30. data/lib/rucc/file_io_list.rb +56 -0
  31. data/lib/rucc/gen.rb +1602 -0
  32. data/lib/rucc/int_evaluator.rb +114 -0
  33. data/lib/rucc/k.rb +73 -0
  34. data/lib/rucc/keyword.rb +17 -0
  35. data/lib/rucc/kind.rb +43 -0
  36. data/lib/rucc/label_gen.rb +13 -0
  37. data/lib/rucc/lexer.rb +40 -0
  38. data/lib/rucc/lexer/impl.rb +683 -0
  39. data/lib/rucc/lexer/preprocessor.rb +888 -0
  40. data/lib/rucc/lexer/preprocessor/cond_incl.rb +27 -0
  41. data/lib/rucc/lexer/preprocessor/constructor.rb +54 -0
  42. data/lib/rucc/lexer/preprocessor/pragma.rb +31 -0
  43. data/lib/rucc/lexer/preprocessor/special_macro.rb +110 -0
  44. data/lib/rucc/libc.rb +47 -0
  45. data/lib/rucc/m.rb +7 -0
  46. data/lib/rucc/macro.rb +24 -0
  47. data/lib/rucc/node.rb +530 -0
  48. data/lib/rucc/node/conv.rb +33 -0
  49. data/lib/rucc/op.rb +61 -0
  50. data/lib/rucc/operator.rb +13 -0
  51. data/lib/rucc/option.rb +30 -0
  52. data/lib/rucc/parser.rb +961 -0
  53. data/lib/rucc/parser/break.rb +18 -0
  54. data/lib/rucc/parser/builtin.rb +25 -0
  55. data/lib/rucc/parser/continue.rb +18 -0
  56. data/lib/rucc/parser/do.rb +33 -0
  57. data/lib/rucc/parser/ensure.rb +39 -0
  58. data/lib/rucc/parser/enum.rb +64 -0
  59. data/lib/rucc/parser/expr.rb +493 -0
  60. data/lib/rucc/parser/for.rb +71 -0
  61. data/lib/rucc/parser/func.rb +274 -0
  62. data/lib/rucc/parser/func_call.rb +54 -0
  63. data/lib/rucc/parser/goto.rb +29 -0
  64. data/lib/rucc/parser/if.rb +23 -0
  65. data/lib/rucc/parser/initializer.rb +237 -0
  66. data/lib/rucc/parser/label.rb +31 -0
  67. data/lib/rucc/parser/return.rb +16 -0
  68. data/lib/rucc/parser/struct_and_union.rb +280 -0
  69. data/lib/rucc/parser/switch.rb +117 -0
  70. data/lib/rucc/parser/while.rb +29 -0
  71. data/lib/rucc/pos.rb +11 -0
  72. data/lib/rucc/rmap.rb +22 -0
  73. data/lib/rucc/s.rb +9 -0
  74. data/lib/rucc/static_label_gen.rb +15 -0
  75. data/lib/rucc/t.rb +18 -0
  76. data/lib/rucc/tempname_gen.rb +14 -0
  77. data/lib/rucc/token.rb +114 -0
  78. data/lib/rucc/token_gen.rb +68 -0
  79. data/lib/rucc/type.rb +304 -0
  80. data/lib/rucc/type/check.rb +39 -0
  81. data/lib/rucc/type/conv.rb +29 -0
  82. data/lib/rucc/type_info.rb +21 -0
  83. data/lib/rucc/utf.rb +126 -0
  84. data/lib/rucc/util.rb +111 -0
  85. data/lib/rucc/version.rb +3 -0
  86. data/rucc.gemspec +38 -0
  87. metadata +201 -0
@@ -0,0 +1,114 @@
1
module Rucc
  # Evaluates integer constant expressions over the AST.
  #
  # Every evaluation yields a pair: the integer value, and an optional node
  # describing an address the value is relative to (nil when the expression
  # does not involve an address).
  class IntEvaluator
    class << self
      # Evaluates node and normalizes the result to an Integer. Comparison
      # and logical operators produce Ruby booleans internally; they are
      # mapped to 1 / 0 here.
      #
      # @param [Node] node
      # @return [<Integer, (Node, NilClass)>]
      # @raise [RuntimeError] when node is not an integer expression
      def eval(node)
        value, addr = do_eval(node)

        result =
          case value
          when TrueClass  then 1
          when FalseClass then 0
          when Integer    then value
          else raise_error(node)
          end

        return result, addr
      end

      private

      # Dispatches on the node kind. May return a boolean as the first
      # element; callers should go through .eval to get an Integer.
      #
      # @param [Node] node
      # @return [<Integer, (Node, NilClass)>]
      def do_eval(node)
        case node.kind
        when AST::LITERAL
          return [node.ival, nil] if Type.is_inttype(node.ty)
          raise_error(node)
        when '!'
          i, addr = self.eval(node.operand)
          return [(i == 0) ? 1 : 0, addr]
        when '~'
          r, addr = self.eval(node.operand)
          return [~r, addr]
        when OP::CAST then return self.eval(node.operand)
        when AST::CONV then return self.eval(node.operand)
        when AST::ADDR
          if node.operand.kind == AST::STRUCT_REF
            return eval_struct_ref(node.operand, 0)
          end
          return 0, Node.conv(node)
        when AST::GVAR
          return 0, Node.conv(node)
        when AST::DEREF
          if node.operand.ty.kind == Kind::PTR
            return self.eval(node.operand)
          end
          raise_error(node)
        when AST::TERNARY
          cond, addr = self.eval(node.cond)
          # NOTE: 0 is truthy in Ruby, so the condition must be compared
          # against 0 explicitly.
          if cond != 0
            return node.thn ? self.eval(node.thn) : [cond, addr]
          end
          return self.eval(node.els)
        when '+' then return eval_binary_expr(node, &:+)
        when '-' then return eval_binary_expr(node, &:-)
        when '*' then return eval_binary_expr(node, &:*)
        when '/' then return eval_binary_expr(node) { |a, b| truncating_div(a, b) }
        when '<' then return eval_binary_expr(node, &:<)
        when '^' then return eval_binary_expr(node, &:^)
        when '&' then return eval_binary_expr(node, &:&)
        when '|' then return eval_binary_expr(node, &:|)
        # Integer#remainder takes the sign of the dividend, like C's %.
        when '%' then return eval_binary_expr(node) { |a, b| a.remainder(b) }
        when OP::EQ then return eval_binary_expr(node, &:==)
        when OP::GE then return eval_binary_expr(node, &:>=)
        when OP::LE then return eval_binary_expr(node, &:<=)
        when OP::NE then return eval_binary_expr(node, &:!=)
        when OP::SAL then return eval_binary_expr(node, &:<<)
        when OP::SAR then return eval_binary_expr(node, &:>>)
        when OP::SHR then return eval_binary_expr(node, &:>>)
        when OP::LOGAND then return eval_binary_expr(node) { |a, b| (a != 0) && (b != 0) }
        when OP::LOGOR then return eval_binary_expr(node) { |a, b| (a != 0) || (b != 0) }
        else
          raise_error(node)
        end
      end

      # @param [Node] node
      # @raise [RuntimeError] always
      def raise_error(node)
        raise "Integer expression expected, but got #{node}"
      end

      # Integer division that truncates toward zero (C semantics), unlike
      # Ruby's Integer#/ which floors toward negative infinity.
      #
      # @param [Integer] a
      # @param [Integer] b
      # @return [Integer]
      # @raise [ZeroDivisionError] when b is 0
      def truncating_div(a, b)
        quotient = a.abs / b.abs
        ((a < 0) ^ (b < 0)) ? -quotient : quotient
      end

      # Evaluates both operands and combines them with the given block.
      # When both sides carry an address, the right-hand one wins.
      #
      # @param [Node] node
      # @return [<(Integer, Boolean), (Node, NilClass)>]
      def eval_binary_expr(node)
        left, addr_l = self.eval(node.left)
        right, addr_r = self.eval(node.right)
        return yield(left, right), (addr_r || addr_l)
      end

      # Walks a chain of struct member references, accumulating the member
      # offsets, then evaluates the innermost expression.
      #
      # @param [Node] node
      # @param [Integer] offset
      # @return [<Integer, (Node, NilClass)>]
      def eval_struct_ref(node, offset)
        if node.kind == AST::STRUCT_REF
          return eval_struct_ref(node.struct, node.ty.offset + offset)
        end
        n, addr = self.eval(node)
        return n + offset, addr
      end
    end
  end
end
@@ -0,0 +1,73 @@
1
+ require "rucc/keyword"
2
+
3
module Rucc
  # Registry of C keywords (plus the "##" and "..." punctuators).
  #
  # Each entry is exposed both as a constant (e.g. K::INT) and through the
  # K.keywords hash, keyed by its source spelling.
  module K
    # @key [String] source spelling
    # @value [Keyword]
    @keywords = {}

    class << self
      # @return [Hash{String => Keyword}]
      attr_reader :keywords

      private

      # Defines the constant `name` and registers it under `str`.
      #
      # @param [Symbol] name keyword name
      # @param [String] str String representation
      # @param [Boolean] is_type
      # @return [Keyword]
      def keyword(name, str, is_type)
        @keywords[str] = const_set(name, Keyword.new(str, is_type))
      end
    end

    # [constant name, spelling, is_type]
    [
      [:ALIGNAS,       "_Alignas",       true],
      [:ALIGNOF,       "_Alignof",       false],
      [:AUTO,          "auto",           true],
      [:BOOL,          "_Bool",          true],
      [:BREAK,         "break",          false],
      [:CASE,          "case",           false],
      [:CHAR,          "char",           true],
      [:COMPLEX,       "_Complex",       true],
      [:CONST,         "const",          true],
      [:CONTINUE,      "continue",       false],
      [:DEFAULT,       "default",        false],
      [:DO,            "do",             false],
      [:DOUBLE,        "double",         true],
      [:ELSE,          "else",           false],
      [:ENUM,          "enum",           true],
      [:EXTERN,        "extern",         true],
      [:FLOAT,         "float",          true],
      [:FOR,           "for",            false],
      [:GENERIC,       "_Generic",       false],
      [:GOTO,          "goto",           false],
      [:IF,            "if",             false],
      [:IMAGINARY,     "_Imaginary",     true],
      [:INLINE,        "inline",         true],
      [:INT,           "int",            true],
      [:LONG,          "long",           true],
      [:NORETURN,      "_Noreturn",      true],
      [:REGISTER,      "register",       true],
      [:RESTRICT,      "restrict",       true],
      [:RETURN,        "return",         false],
      [:HASHHASH,      "##",             false],
      [:SHORT,         "short",          true],
      [:SIGNED,        "signed",         true],
      [:SIZEOF,        "sizeof",         false],
      [:STATIC,        "static",         true],
      [:STATIC_ASSERT, "_Static_assert", false],
      [:STRUCT,        "struct",         true],
      [:SWITCH,        "switch",         false],
      [:ELLIPSIS,      "...",            false],
      [:TYPEDEF,       "typedef",        true],
      [:TYPEOF,        "typeof",         true],
      [:UNION,         "union",          true],
      [:UNSIGNED,      "unsigned",       true],
      [:VOID,          "void",           true],
      [:VOLATILE,      "volatile",       true],
      [:WHILE,         "while",          false],
    ].each { |name, str, is_type| keyword(name, str, is_type) }
  end
end
@@ -0,0 +1,17 @@
1
module Rucc
  # A keyword (or keyword-like punctuator): its source spelling plus a flag
  # telling whether it can appear as part of a type name.
  class Keyword
    # @return [String] source spelling
    attr_reader :str

    # The string form of a keyword is its spelling.
    alias to_s str

    # @param [String] str source spelling
    # @param [Boolean] is_type
    def initialize(str, is_type)
      @str, @is_type = str, is_type
    end

    # @return [Boolean] the is_type flag given at construction
    def is_type?
      @is_type
    end
  end
end
@@ -0,0 +1,43 @@
1
module Rucc
  # Enumeration of type kinds. Each kind is a constant on this class
  # (Kind::INT, Kind::PTR, ...) and kinds are ordered by their numeric id.
  class Kind
    include Comparable

    class << self
      # Defines the constant `name` holding a new Kind with the given id.
      #
      # @param [Symbol] name
      # @param [Integer] id
      # @return [Kind]
      def def_kind(name, id)
        const_set(name, new(id, name))
      end
    end

    # @param [Integer] id
    # @param [Symbol] name
    def initialize(id, name)
      @id = id # Used only for <=>
      @name = name
    end
    attr_reader :id

    # Orders kinds by id. Returns nil for anything that is not a Kind, so
    # that `kind == other_object` is simply false instead of raising
    # NoMethodError from inside Comparable.
    #
    # @param [Object] other
    # @return [Integer, NilClass]
    def <=>(other)
      return nil unless other.is_a?(Kind)

      @id <=> other.id
    end

    # @return [String] the constant name, e.g. "INT"
    def to_s
      @name.to_s
    end

    def_kind :VOID, 0
    def_kind :BOOL, 1
    def_kind :CHAR, 2
    def_kind :SHORT, 3
    def_kind :INT, 4
    def_kind :LONG, 5
    def_kind :LLONG, 6
    def_kind :FLOAT, 7
    def_kind :DOUBLE, 8
    def_kind :LDOUBLE, 9
    def_kind :ARRAY, 10
    def_kind :ENUM, 11
    def_kind :PTR, 12
    def_kind :STRUCT, 13
    def_kind :FUNC, 14
    # used only in parser
    def_kind :STUB, 15
  end
end
@@ -0,0 +1,13 @@
1
module Rucc
  # Hands out sequential label names: ".L0", ".L1", ".L2", ...
  class LabelGen
    def initialize
      @label_id = 0 # [Integer] id of the next label to hand out
    end

    # @return [String] a label not returned before by this generator
    def next
      current = @label_id
      @label_id = current + 1
      ".L#{current}"
    end
  end
end
@@ -0,0 +1,40 @@
1
+ require "rucc/libc"
2
+ require "rucc/token"
3
+ require "rucc/util"
4
+
5
+ require "rucc/lexer/impl"
6
+ require "rucc/lexer/preprocessor"
7
+
8
+ require "rucc/file_io"
9
+ require "rucc/file_io_list"
10
+
11
module Rucc
  # Facade over the lexing pipeline: a file stack, the raw tokenizer (Impl)
  # reading from it, and the preprocessor sitting on top of the tokenizer.
  class Lexer
    extend Forwardable

    # TODO(south37) Impl file management as input
    # @param [IO] input buffered io of C sourcecode.
    # @param [String] filename
    def initialize(input, filename)
      @files = FileIOList.new(FileIO.new(input, filename))
      @impl = Impl.new(@files)
      @preprocessor = Preprocessor.new(@impl)
    end

    # Token-level API, served by the preprocessor.
    def_delegators :@preprocessor,
                   :read_token,
                   :unget_token,
                   :peek_token,
                   :append_include_path,
                   :expr_reader=

    # Input stream switching, served by the file stack.
    def_delegators :@files, :stream_stash, :stream_unstash

    # NOTE: Used only for debug
    def_delegators :@impl, :lex
  end
end
@@ -0,0 +1,683 @@
1
+ require "forwardable"
2
+
3
+ require "rucc/file_io"
4
+ require "rucc/token_gen"
5
+ require "rucc/pos"
6
+
7
module Rucc
  class Lexer
    # Character-level tokenizer. Reads characters from the current file of a
    # FileIOList and turns them into tokens, with a stack of token buffers
    # to support unget/peek and temporary token streams.
    class Impl
      extend Forwardable

      # @param [FileIOList] files
      def initialize(files)
        @infile = files.first
        @files = files

        @buffers = [[]] # stack buffers to impl peek.
        @token_gen = TokenGen.new(@files)
      end
      attr_reader :infile
      delegate [:stream_depth] => :@files

      # Pushes a token back so that the next lex returns it. EOF is never
      # pushed back.
      #
      # @param [Token] tok
      def unget_token(tok)
        return if tok.kind == T::EOF # Does not unget

        @buffers.last.push(tok)
      end

      # Pushes back several tokens so that they are read again in the
      # given order.
      #
      # @param [<Token>] tokens
      def unget_all(tokens)
        tokens.reverse.each { |token| unget_token(token) }
      end

      # Returns the next token: a pushed-back one if any, EOF while a
      # stashed token buffer is exhausted, otherwise a token read from the
      # input with its `space` and `bol` flags set.
      #
      # @return [Token]
      def lex
        buf = @buffers.last
        return buf.pop if buf.size > 0

        # A stashed token buffer is active and exhausted.
        return Token::EOF_TOKEN if @buffers.size > 1

        bol = (current_file.column == 1)
        tok = do_read_token
        while tok.kind == T::SPACE
          tok = do_read_token
          tok.space = true
        end
        tok.bol = bol
        tok
      end

      # Reads a token from a given string.
      # This function temporarily switches the main input stream to
      # a given string and reads one token. The original stream is restored
      # even when an error is raised.
      #
      # @param [String] s
      # @return [Token]
      # @raise [RuntimeError] when s holds more than one token
      def lex_string(s)
        @files.stream_stash([FileIO.new(StringIO.new(s), "-")])
        begin
          tok = do_read_token
          next?("\n")
          pos = get_pos(0)
          raise "#{pos}: unconsumed input: #{s}" unless peek.nil? # nil means EOF

          tok
        ensure
          @files.stream_unstash
        end
      end

      # Reads a header file name for #include.
      #
      # Filenames after #include need a special tokenization treatment.
      # A filename string may be quoted by < and > instead of "".
      # Even if it's quoted by "", it's still different from a regular string token.
      # For example, \ in this context is not interpreted as a quote.
      # Thus, we cannot use lex() to read a filename.
      #
      # That the C preprocessor requires a special lexer behavior only for
      # #include is a violation of layering. Ideally, the lexer should be
      # agnostic about higher layers status. But we need this for the C grammar.
      #
      # The second element is true for <...> names and false for "..." names.
      #
      # @return [<String, Boolean>, <NilClass, NilClass>]
      # @raise [RuntimeError] on an unterminated or empty header name
      def read_header_file_name
        std = nil
        return nil, std unless buffer_empty?

        skip_space!
        pos = get_pos(0)
        if next?('"')
          std = false
          close = '"'
        elsif next?('<')
          std = true
          close = '>'
        else
          return nil, std
        end

        name = +""
        until next?(close)
          c = readc
          # NOTE: must be a double-quoted "\n"; '\n' is backslash + n.
          if c.nil? || c == "\n"
            raise "#{pos}: premature end of header name"
          end
          name << c
        end
        raise "#{pos}: header name should not be empty" if name.size == 0

        return name, std
      end

      # @param [FileIO] file
      def push_file(file)
        @files.push(file)
      end

      # Temporarily switches the input token stream to given list of tokens,
      # so that you can get the tokens as return values of lex() again.
      # After the tokens are exhausted, EOF is returned from lex() until
      # "unstash" is called to restore the original state.
      #
      # @param [<Token>] buf
      def token_buffer_stash(buf)
        @buffers.push(buf)
      end

      # Drops the token buffer pushed by the last token_buffer_stash.
      def token_buffer_unstash
        @buffers.pop
      end

      # Skips a block of code excluded from input by #if, #ifdef and the like.
      # C11 6.10 says that code within #if and #endif needs to be a sequence of
      # valid tokens even if skipped. However, in reality, most compilers don't
      # tokenize nor validate contents. We don't do that, too.
      # This function is to skip code until matching #endif as fast as we can.
      def skip_cond_incl!
        nest = 0
        while true
          bol = current_file.column == 1
          skip_space!
          c = readc
          return if c.nil? # EOF

          if c == '\''
            skip_char!
            next
          end
          if c == '"'
            skip_string!
            next
          end
          next if c != '#' || !bol

          column = current_file.column - 1
          tok = lex
          next if tok.kind != T::IDENT

          if (nest == 0) && (Token.is_ident?(tok, "else") || Token.is_ident?(tok, "elif") || Token.is_ident?(tok, "endif"))
            # Push the directive back ('#' followed by its name) so the
            # caller reads it as a regular directive.
            unget_token(tok)
            hash = @token_gen.make_keyword('#')
            hash.bol = true
            hash.column = column
            unget_token(hash)
            return
          end
          if Token.is_ident?(tok, "if") || Token.is_ident?(tok, "ifdef") || Token.is_ident?(tok, "ifndef")
            nest += 1
          elsif (nest > 0) && Token.is_ident?(tok, "endif")
            nest -= 1
          end
          skip_line!
        end
      end

      # @return [FileIO]
      def current_file
        @files.current
      end

      private

      # @return [Char, NilClass] nil at EOF
      def readc
        @files.readc
      end

      # @param [Char] c
      def unreadc(c)
        @files.unreadc(c)
      end

      # Update current position
      def mark!
        @token_gen.pos = get_pos(0)
      end

      # @param [Integer] delta column adjustment
      # @return [Pos]
      def get_pos(delta)
        Pos.new(current_file.line, current_file.column + delta)
      end

      # @return [Boolean] true when no token is pushed back and no token
      #   buffer is stashed
      def buffer_empty?
        @buffers.size == 1 && @buffers.first.size == 0
      end

      # @param [Char] c
      # @return [Boolean]
      def iswhitespace(c)
        (c == ' ' || c == "\t" || c == "\f" || c == "\v")
      end

      # Consumes input up to and including the closing "*/".
      #
      # @raise [RuntimeError] when EOF comes first
      def skip_block_comment!
        maybe_end = false
        while true
          c = readc
          raise "premature end of block comment" if c.nil?
          return if c == '/' && maybe_end

          maybe_end = (c == '*')
        end
      end

      # Consumes input up to, but not including, the next newline (or EOF).
      def skip_line!
        while true
          c = readc
          return if c.nil?

          if c == "\n"
            unreadc(c)
            return
          end
        end
      end

      # Skips spaces including comments.
      # Returns true if at least one space is skipped.
      #
      # @return [Boolean] true if skipped
      def skip_space!
        return false unless do_skip_space!

        while do_skip_space!; end
        true
      end

      # Skips a single whitespace character or a single comment.
      #
      # @return [Boolean] true if something was skipped
      def do_skip_space!
        c = readc
        return false if c.nil? # EOF
        return true if iswhitespace(c)

        if c == '/'
          if next?('*')
            skip_block_comment!
            return true
          end
          if next?('/')
            skip_line!
            return true
          end
        end
        unreadc(c)
        false
      end

      # Skips the rest of a character literal whose opening quote has
      # already been read.
      def skip_char!
        readc if readc == '\\'
        c = readc
        while !c.nil? && c != '\''
          c = readc
        end
      end

      # Skips the rest of a string literal whose opening quote has already
      # been read, honoring backslash escapes.
      def skip_string!
        c = readc
        while !c.nil? && c != '"'
          readc if c == '\\'
          c = readc
        end
      end

      # Consumes the next character only if it equals `expect`.
      #
      # @param [Char] expect
      # @return [Boolean]
      def next?(expect)
        c = readc
        return true if c == expect

        unreadc(c)
        false
      end

      # @param [Char] expect
      # @param [OP] t keyword to make when `expect` follows
      # @param [Char] els keyword to make otherwise
      # @return [Token]
      def read_rep(expect, t, els)
        @token_gen.make_keyword(next?(expect) ? t : els)
      end

      # @param [Char] expect1
      # @param [OP] t1
      # @param [Char] expect2
      # @param [OP] t2
      # @param [Char] els
      # @return [Token]
      def read_rep2(expect1, t1, expect2, t2, els)
        return @token_gen.make_keyword(t1) if next?(expect1)
        return @token_gen.make_keyword(t2) if next?(expect2)

        @token_gen.make_keyword(els)
      end

      # Reads a digraph starting with '%'. Digraphs are alternative spellings
      # for some punctuation characters. They are useless in ASCII.
      # We implement this just for the standard compliance.
      # See C11 6.4.6p3 for the spec.
      #
      # @return [Token, NilClass]
      def read_hash_digraph
        return @token_gen.make_keyword('}') if next?('>')

        if next?(':')
          if next?('%')
            return @token_gen.make_keyword(K::HASHHASH) if next?(':')

            unreadc('%')
          end
          return @token_gen.make_keyword('#')
        end
        nil
      end

      # Reads the rest of an identifier whose first character is `c`.
      #
      # @param [Char] c
      # @return [Token]
      def read_ident(c)
        b = c.dup
        while true
          c = readc
          if c && (Libc.isalnum(c) || ((c.ord & 0x80) > 0) || (c == '_') || (c == '$'))
            b << c
            next
          end
          # C11 6.4.2.1: \u or \U characters (universal-character-name)
          # are allowed to be part of identifiers.
          if c && (c == '\\' && (peek == 'u' || peek == 'U'))
            escaped = read_escaped_char
            UTF.write_utf8(b, escaped)
            next
          end
          unreadc(c)
          return @token_gen.make_ident(b)
        end
      end

      # Reads a character literal whose opening quote has already been read.
      #
      # @param [ENC] enc
      # @return [Token]
      # @raise [RuntimeError] when the literal is not terminated
      def read_char(enc)
        c = readc
        raise "unterminated char" if c.nil? # EOF right after the quote

        r = (c == '\\') ? read_escaped_char : c.ord
        c = readc
        raise "unterminated char" if c != "'"

        if enc == ENC::NONE
          # NOTE: Only lower 8 bit has meaning
          return @token_gen.make_char(0xFF & r, enc)
        end
        @token_gen.make_char(r, enc)
      end

      # Reads a string literal whose opening quote has already been read.
      #
      # @param [ENC] enc
      # @return [Token]
      # @raise [RuntimeError] when the literal is not terminated
      def read_string(enc)
        b = +""
        while true
          c = readc
          raise "unterminated string" if c.nil?
          break if c == '"'

          if c != '\\'
            b << c
            next
          end
          # Just after backslash escape
          isucs = (peek == 'u' || peek == 'U')
          c = read_escaped_char
          if isucs
            UTF.write_utf8(b, c)
            next
          end
          b << c
        end
        @token_gen.make_strtok(b, enc)
      end

      # Reads the part of an escape sequence that follows the backslash.
      #
      # @return [Integer] character code
      # @raise [RuntimeError] when EOF follows the backslash
      def read_escaped_char
        c = readc
        case c
        when nil
          raise "premature end of input"
        when '\'', '"', '?', '\\'
          return c.ord
        when 'a'
          return "\a".ord
        when 'b'
          return "\b".ord
        when 'f'
          return "\f".ord
        when 'n'
          return "\n".ord
        when 'r'
          return "\r".ord
        when 't'
          return "\t".ord
        when 'v'
          return "\v".ord
        when 'e'
          return "\e".ord # '\e' is GNU extension
        when 'x'
          return read_hex_char
        when 'u'
          return read_universal_char(4)
        when 'U'
          return read_universal_char(8)
        when *'0'..'7'
          return read_octal_char(c)
        end
        # Unknown escape character: use the character itself.
        c.ord
      end

      # Reads a number literal. Lexer's grammar on numbers is not strict.
      # Integers and floating point numbers and different base numbers are not distinguished.
      #
      # @param [Char] c first character of the number
      # @return [Token]
      def read_number(c)
        b = c.dup
        last = c
        while true
          c = readc
          # A sign is part of the number only right after an exponent
          # marker. The nil check keeps EOF from reaching String#include?.
          flonum = !c.nil? && "eEpP".include?(last) && "+-".include?(c)
          if c.nil? || (!Libc.isdigit(c) && !Libc.isalpha(c) && c != '.' && !flonum)
            unreadc(c)
            return @token_gen.make_number(b)
          end
          b << c
          last = c
        end
      end

      # Reads a \x escape sequence.
      #
      # @return [Integer]
      # @raise [RuntimeError] when no hexadecimal digit follows \x
      def read_hex_char
        pos = get_pos(-2)
        c = readc
        if !Libc.isxdigit(c)
          raise "#{pos}: \\x is not followed by a hexadecimal character: #{c}"
        end
        r = 0
        while true
          case c
          when '0'..'9' then r = (r << 4) | (c.ord - '0'.ord)
          when 'a'..'f' then r = (r << 4) | (c.ord - 'a'.ord + 10)
          when 'A'..'F' then r = (r << 4) | (c.ord - 'A'.ord + 10)
          else
            unreadc(c)
            return r
          end
          c = readc
        end
      end

      # Reads \u or \U escape sequences. len is 4 or 8, respectively.
      #
      # @param [Integer] len number of hexadecimal digits to read
      # @return [Integer]
      # @raise [RuntimeError] on a non-hex digit or a disallowed code point
      def read_universal_char(len)
        pos = get_pos(-2)
        r = 0
        len.times do
          c = readc
          case c
          when *'0'..'9' then r = (r << 4) | (c.ord - '0'.ord)
          when *'a'..'f' then r = (r << 4) | (c.ord - 'a'.ord + 10)
          when *'A'..'F' then r = (r << 4) | (c.ord - 'A'.ord + 10)
          else
            raise "#{pos}: invalid universal character: #{c}"
          end
        end
        if !is_valid_ucn(r)
          # Report the code point in hexadecimal, as it was written.
          raise "#{pos}: invalid universal character: \\#{(len == 4) ? 'u' : 'U'}#{format("%0#{len}x", r)}"
        end
        r
      end

      # @param [Integer] c
      # @return [Boolean]
      def is_valid_ucn(c)
        # C11 6.4.3p2: U+D800 to U+DFFF are reserved for surrogate pairs.
        # A codepoint within the range cannot be a valid character.
        return false if (0xD800 <= c) && (c <= 0xDFFF)

        # It's not allowed to encode ASCII characters using \U or \u.
        # Some characters not in the basic character set (C11 5.2.1p3)
        # are allowed as exceptions.
        (0xA0 <= c) || (c == '$'.ord) || (c == '@'.ord) || (c == '`'.ord)
      end

      # Reads an octal escape sequence (up to three digits in total).
      #
      # @param [Char] c first octal digit
      # @return [Integer]
      def read_octal_char(c)
        r = c.ord - '0'.ord
        return r unless nextoct?

        r = (r << 3) | (readc.ord - '0'.ord)
        return r unless nextoct?

        (r << 3) | (readc.ord - '0'.ord)
      end

      # @return [Boolean] true when the next character is an octal digit
      def nextoct?
        ('0'..'7').include?(peek)
      end

      # Reads one raw token from the input. A run of spaces/comments is
      # reported as a single SPACE token.
      #
      # @return [Token]
      def do_read_token
        return Token::SPACE_TOKEN if skip_space!

        mark!
        c = readc
        case c
        when "\n"
          return Token::NEWLINE_TOKEN
        when ':'
          return @token_gen.make_keyword(next?('>') ? ']' : ':')
        when '#'
          return @token_gen.make_keyword(next?('#') ? K::HASHHASH : '#')
        when '+'
          return read_rep2('+', OP::INC, '=', OP::A_ADD, '+')
        when '*'
          return read_rep('=', OP::A_MUL, '*')
        when '='
          return read_rep('=', OP::EQ, '=')
        when '!'
          return read_rep('=', OP::NE, '!')
        when '&'
          return read_rep2('&', OP::LOGAND, '=', OP::A_AND, '&')
        when '|'
          return read_rep2('|', OP::LOGOR, '=', OP::A_OR, '|')
        when '^'
          return read_rep('=', OP::A_XOR, '^')
        when '"'
          return read_string(ENC::NONE)
        when '\''
          return read_char(ENC::NONE)
        when '/'
          return @token_gen.make_keyword(next?('=') ? OP::A_DIV : '/')
        when *'a'..'t', *'v'..'z', *'A'..'K', *'M'..'T', *'V'..'Z', '_', '$', *(0x80.chr..0xFD.chr)
          return read_ident(c)
        when *'0'..'9'
          return read_number(c)
        when 'L', 'U'
          # NOTE: Wide/char32_t character/string literal
          enc = (c == 'L') ? ENC::WCHAR : ENC::CHAR32
          return read_string(enc) if next?('"')
          return read_char(enc) if next?('\'')
          return read_ident(c)
        when 'u'
          return read_string(ENC::CHAR16) if next?('"')
          return read_char(ENC::CHAR16) if next?('\'')
          # C11 6.4.5: UTF-8 string literal
          if next?('8')
            return read_string(ENC::UTF8) if next?('"')

            unreadc('8')
          end
          return read_ident(c)
        when '.'
          return read_number(c) if Libc.isdigit(peek)
          if next?('.')
            return @token_gen.make_keyword(K::ELLIPSIS) if next?('.')

            return @token_gen.make_ident('..')
          end
          return @token_gen.make_keyword('.')
        when '(', ')', ',', ';', '[', ']', '{', '}', '?', '~'
          return @token_gen.make_keyword(c)
        when '-'
          return @token_gen.make_keyword(OP::DEC) if next?('-')
          return @token_gen.make_keyword(OP::ARROW) if next?('>')
          return @token_gen.make_keyword(OP::A_SUB) if next?('=')
          return @token_gen.make_keyword('-')
        when '<'
          return read_rep('=', OP::A_SAL, OP::SAL) if next?('<')
          return @token_gen.make_keyword(OP::LE) if next?('=')
          return @token_gen.make_keyword('[') if next?(':')
          return @token_gen.make_keyword('{') if next?('%')
          return @token_gen.make_keyword('<')
        when '>'
          return @token_gen.make_keyword(OP::GE) if next?('=')
          return read_rep('=', OP::A_SAR, OP::SAR) if next?('>')
          return @token_gen.make_keyword('>')
        when '%'
          tok = read_hash_digraph
          return tok if tok
          return read_rep('=', OP::A_MOD, '%')
        when nil
          return Token::EOF_TOKEN
        else
          return @token_gen.make_invalid(c.ord)
        end
      end

      # Returns the next character without consuming it.
      #
      # @return [Char, NilClass]
      def peek
        r = readc
        unreadc(r)
        r
      end
    end
  end
end