parser 1.4.2 → 2.0.0.beta1

@@ -88,17 +88,20 @@ class Parser::Lexer
 
   attr_accessor :cond, :cmdarg
 
-  attr_reader :comments
+  attr_accessor :tokens, :comments
 
   def initialize(version)
     @version = version
     @static_env = nil
 
+    @tokens = nil
+    @comments = nil
+
     reset
   end
 
   def reset(reset_state=true)
-    # Ragel-related variables:
+    # Ragel state:
     if reset_state
       # Unit tests set state prior to resetting lexer.
       @cs = self.class.lex_en_line_begin
@@ -119,7 +122,8 @@ class Parser::Lexer
     @token_queue = []
     @literal_stack = []
 
-    @comments = "" # collected comments
+    @eq_begin_s = nil # location of last encountered =begin
+    @sharp_s = nil # location of last encountered #
 
     @newline_s = nil # location of last encountered newline
 
@@ -150,7 +154,7 @@ class Parser::Lexer
       #
       # Patches accepted.
       #
-      @source = @source_buffer.source + "\0\0\0"
+      @source = @source_buffer.source.gsub(/\r\n/, "\n") + "\0\0\0"
     else
       @source = nil
     end
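
Note: with the input normalized up front, CRLF sequences never reach the state machine, which is why the `c_nl` definition below drops its optional `'\r'`. A standalone sketch of what the normalization does (the variable names here are illustrative, not part of the lexer):

    source     = "puts 1\r\nputs 2\r\n"
    normalized = source.gsub(/\r\n/, "\n") + "\0\0\0"  # NUL padding for the Ragel scanner, as in the diff
    normalized  # => "puts 1\nputs 2\n\0\0\0"
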
@@ -212,14 +216,6 @@ class Parser::Lexer
     end
   end
 
-  # Return the current collected comment block and clear the storage.
-  def clear_comments
-    comments = @comments
-    @comments = ""
-
-    comments
-  end
-
   protected
 
   def eof_char?(char)
@@ -244,7 +240,13 @@ class Parser::Lexer
   end
 
   def emit(type, value = tok, s = @ts, e = @te)
-    @token_queue << [ type, [ value, range(s, e) ] ]
+    token = [ type, [ value, range(s, e) ] ]
+
+    @token_queue.push(token)
+
+    @tokens.push(token) if @tokens
+
+    token
   end
 
   def emit_table(table, s = @ts, e = @te)
@@ -253,6 +255,18 @@ class Parser::Lexer
     emit(table[value], value, s, e)
   end
 
+  def emit_comment(s = @ts, e = @te)
+    if @comments
+      @comments.push(Parser::Source::Comment.new(range(s, e)))
+    end
+
+    if @tokens
+      @tokens.push([ :tCOMMENT, [ tok(s, e), range(s, e) ] ])
+    end
+
+    nil
+  end
+
   def diagnostic(type, message, location=range, highlights=[])
     @diagnostics.process(
         Parser::Diagnostic.new(type, message, location, highlights))
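
Taken together, the new `tokens`/`comments` writers, the extended `emit`, and `emit_comment` let a caller opt into token and comment collection by assigning arrays before lexing: `emit` mirrors every token into `@tokens`, and `emit_comment` records each comment as a `Parser::Source::Comment`. A minimal usage sketch, assuming `lexer` is an already configured `Parser::Lexer` and `buffer` a `Parser::Source::Buffer` (driver setup omitted):

    lexer.tokens   = []   # receives the [ type, [ value, range ] ] tuples built by #emit
    lexer.comments = []   # receives Parser::Source::Comment objects from #emit_comment

    lexer.source_buffer = buffer

    loop do
      type, _ = lexer.advance
      break unless type   # advance signals end of input with a false/nil token type
    end

    lexer.tokens.each   { |type, (value, _range)| puts "#{type} #{value.inspect}" }
    lexer.comments.each { |comment| p comment }
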
@@ -363,7 +377,7 @@ class Parser::Lexer
     @newline_s = p
   }
 
-  c_nl = '\r'? '\n' $ do_nl;
+  c_nl = '\n' $ do_nl;
   c_space = [ \t\r\f\v];
   c_space_nl = c_space | c_nl;
   c_eof = 0x04 | 0x1a | 0; # ^D, ^Z, EOF
@@ -486,21 +500,23 @@ class Parser::Lexer
   action unicode_points {
     @escape = ""
 
-    codepoints = tok(@escape_s + 2, p - 1)
+    codepoints  = tok(@escape_s + 2, p - 1)
+    codepoint_s = @escape_s + 2
+
     codepoints.split(/[ \t]/).each do |codepoint_str|
       codepoint = codepoint_str.to_i(16)
 
       if codepoint >= 0x110000
         @escape = lambda do
-          # TODO better location reporting
           diagnostic :error, Parser::ERRORS[:unicode_point_too_large],
-                     range(@escape_s, p)
+                     range(codepoint_s, codepoint_s + codepoint_str.length)
         end
 
         break
       end
 
       @escape += codepoint.chr(Encoding::UTF_8)
+      codepoint_s += codepoint_str.length + 1
     end
   }
 
@@ -886,6 +902,44 @@ class Parser::Lexer
   };
   *|;
 
+  #
+  # === WHITESPACE HANDLING ===
+  #
+
+  # Various contexts in Ruby allow various kinds of whitespace
+  # to be used. They are grouped to clarify the lexing machines
+  # and ease collection of comments.
+
+  # A line of code with inline #comment at end is always equivalent
+  # to a line of code ending with just a newline, so an inline
+  # comment is deemed equivalent to non-newline whitespace
+  # (c_space character class).
+
+  w_space =
+      c_space+
+    | '\\' e_heredoc_nl
+    | '#' %{ @sharp_s = p - 1 } c_line* %{ emit_comment(@sharp_s, p) }
+    ;
+
+  # A newline in non-literal context always interoperates with
+  # here document logic and can always be escaped by a backslash,
+  # still interoperating with here document logic in the same way,
+  # yet being invisible to anything else.
+  #
+  # To demonstrate:
+  #
+  #     foo = <<FOO \
+  #     bar
+  #     FOO
+  #      + 2
+  #
+  # is equivalent to `foo = "bar\n" + 2`.
+
+  w_newline = e_heredoc_nl;
+
+  w_space_newline = w_space | w_newline;
+
+
   #
   # === EXPRESSION PARSING ===
   #
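
The heredoc example in the comment above can be checked against the parser itself; a quick sketch, assuming a released 2.x gem where `parser/current` is available:

    require 'parser/current'

    # The program from the comment above, written out with explicit newlines:
    #   foo = <<FOO \
    #   bar
    #   FOO
    #    + 2
    code = "foo = <<FOO \\\nbar\nFOO\n + 2\n"

    Parser::CurrentRuby.parse(code)
    # => s(:lvasgn, :foo, s(:send, s(:str, "bar\n"), :+, s(:int, 2)))
    # i.e. the same tree that `foo = "bar\n" + 2` produces.
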
@@ -1019,9 +1073,7 @@ class Parser::Lexer
       ':'
       => { fhold; fgoto expr_beg; };
 
-      # TODO whitespace rule
-      c_space;
-      e_heredoc_nl;
+      w_space_newline;
 
       c_any
       => { fhold; fgoto expr_end; };
@@ -1037,10 +1089,9 @@ class Parser::Lexer
   expr_endfn := |*
       bareword ':'
       => { emit(:tLABEL, tok(@ts, @te - 1))
-           fnext expr_end; fbreak; };
+           fnext expr_beg; fbreak; };
 
-      # TODO whitespace rule
-      c_space;
+      w_space;
 
       c_any
       => { fhold; fgoto expr_end; };
@@ -1072,11 +1123,7 @@ class Parser::Lexer
       => { emit_table(PUNCTUATION)
            fnext expr_arg; fbreak; };
 
-      # TODO whitespace rule
-      c_space;
-      e_heredoc_nl;
-      '\\' e_heredoc_nl;
-      '#' c_line*;
+      w_space_newline;
 
       c_any
       => { fhold; fgoto expr_end; };
@@ -1194,7 +1241,7 @@ class Parser::Lexer
         fgoto expr_end;
       };
 
-      c_space* ( '#' c_line* )? c_nl
+      w_space? w_newline
       => { fhold; fgoto expr_end; };
 
       c_any
@@ -1227,7 +1274,7 @@ class Parser::Lexer
       => { emit(:kDO_BLOCK)
            fnext expr_value; };
 
-      c_space;
+      w_space;
 
       c_any
       => { fhold; fgoto expr_end; };
@@ -1245,11 +1292,8 @@ class Parser::Lexer
       => { emit_table(KEYWORDS)
            fnext expr_beg; fbreak; };
 
-      # TODO whitespace rule
-      c_space;
-      '#' c_line*;
-
-      e_heredoc_nl
+      w_space;
+      w_newline
       => { fhold; fgoto expr_end; };
 
       c_any
@@ -1487,14 +1531,7 @@ class Parser::Lexer
       # WHITESPACE
       #
 
-      # TODO whitespace rule
-      c_space;
-      e_heredoc_nl;
-      '\\' e_heredoc_nl;
-
-      '#' c_line* c_eol
-      => { @comments << tok
-           fhold; };
+      w_space_newline;
 
       e_heredoc_nl '=begin' ( c_space | c_eol )
       => { p = @ts - 1
@@ -1523,8 +1560,7 @@ class Parser::Lexer
       => { p = @ts - 1
            fgoto expr_end; };
 
-      # TODO whitespace rule
-      c_space;
+      w_space;
 
       c_any
       => { fhold; fgoto expr_beg; };
@@ -1779,8 +1815,13 @@ class Parser::Lexer
       # WHITESPACE
       #
 
-      # TODO whitespace rule
-      '\\' e_heredoc_nl;
+      w_space;
+      w_newline
+      => { fgoto leading_dot; };
+
+      ';'
+      => { emit_table(PUNCTUATION)
+           fnext expr_value; fbreak; };
 
       '\\' c_line {
         diagnostic :error, Parser::ERRORS[:bare_backslash],
@@ -1788,18 +1829,6 @@ class Parser::Lexer
         fhold;
       };
 
-      c_space+;
-
-      '#' c_line*
-      => { @comments << tok(@ts, @te + 1) };
-
-      e_heredoc_nl
-      => { fgoto leading_dot; };
-
-      ';'
-      => { emit_table(PUNCTUATION)
-           fnext expr_value; fbreak; };
-
       c_any
       => {
         message = Parser::ERRORS[:unexpected] % { :character => tok.inspect[1..-2] }
@@ -1827,30 +1856,26 @@ class Parser::Lexer
   #
 
   line_comment := |*
-      '=end' c_line* c_nl
-      => { @comments << tok
-           fgoto line_begin; };
+      '=end' c_line* c_nl?
+      => {
+        emit_comment(@eq_begin_s, @te)
+        fgoto line_begin;
+      };
 
-      c_line* c_nl
-      => { @comments << tok };
+      c_any;
 
       c_eof
       => {
-        # TODO better location information here
-        diagnostic :fatal, Parser::ERRORS[:embedded_document], range(p - 1, p)
+        diagnostic :fatal, Parser::ERRORS[:embedded_document],
+                   range(@eq_begin_s, @eq_begin_s + '=begin'.length)
       };
   *|;
 
   line_begin := |*
-      # TODO whitespace rule
-      c_space_nl+;
-
-      '#' c_line* c_eol
-      => { @comments << tok
-           fhold; };
+      w_space_newline;
 
       '=begin' ( c_space | c_eol )
-      => { @comments << tok(@ts, @te)
+      => { @eq_begin_s = @ts
            fgoto line_comment; };
 
       '__END__' c_eol
@@ -2,19 +2,10 @@ module Parser
 
   module Lexer::Explanation
 
-    def self.included(klass)
-      klass.class_exec do
-        alias_method :state_before_explanation=, :state=
-        alias_method :advance_before_explanation, :advance
-
-        remove_method :state=, :advance
-      end
-    end
-
     # Like #advance, but also pretty-print the token and its position
     # in the stream to `stdout`.
     def advance
-      type, (val, range) = advance_before_explanation
+      type, (val, range) = super
 
       puts decorate(range,
                     "\e[0;32m#{type} #{val.inspect}\e[0m",
@@ -27,7 +18,7 @@ module Parser
       puts " \e[1;33m>>> STATE SET <<<\e[0m " +
            "#{new_state.to_s.ljust(12)} #{@cond} #{@cmdarg}".rjust(66)
 
-      self.state_before_explanation = new_state
+      super
     end
 
     private
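
Dropping the `included` hook with its `alias_method`/`remove_method` juggling implies the explanation module is now layered in front of the lexer with `Module#prepend` (new in Ruby 2.0), so a plain `super` reaches the original `advance` and `state=`. A minimal sketch of the pattern, using hypothetical stand-in names rather than the gem's actual classes:

    class Lexer
      def advance
        [:tIDENTIFIER, ['foo', 0...3]]
      end
    end

    module Explanation
      def advance
        token = super          # reaches Lexer#advance because the module is prepended
        puts "lexed: #{token.inspect}"
        token
      end
    end

    Lexer.prepend(Explanation)
    Lexer.new.advance   # prints the token, then returns it as usual
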
@@ -11,8 +11,7 @@ module Parser
     private
 
     def assignment?(node)
-      [:lvasgn, :ivasgn, :gvasgn,
-       :cvasgn, :cvdecl, :cdecl].include?(node.type)
+      [:lvasgn, :ivasgn, :gvasgn, :cvasgn, :casgn].include?(node.type)
     end
 
     def remove(range)
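
The shorter list mirrors the 2.0 AST rename: the separate declaration node types are folded into `casgn` and `cvasgn`. Roughly, under the new format (a sketch assuming a released 2.x gem where `parser/current` is available):

    require 'parser/current'

    Parser::CurrentRuby.parse('Foo = 1')
    # => s(:casgn, nil, :Foo, s(:int, 1))

    Parser::CurrentRuby.parse('@@foo = 1')
    # => s(:cvasgn, :@@foo, s(:int, 1))
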
@@ -1155,7 +1155,6 @@ rule
         diagnostic(:error, :class_in_def, val[0])
       end
 
-      @comments.push @lexer.clear_comments
       @static_env.extend_static
     }
     bodystmt kEND
@@ -1166,7 +1165,6 @@ rule
                                   val[4], val[5])
 
       @static_env.unextend
-      @lexer.clear_comments
     }
   | kCLASS tLSHFT expr term
     {
@@ -1181,7 +1179,6 @@ rule
                                   val[5], val[6])
 
       @static_env.unextend
-      @lexer.clear_comments
 
       @def_level = val[4]
     }
@@ -1191,7 +1188,6 @@ rule
         diagnostic(:error, :module_in_def, val[0])
       end
 
-      @comments.push @lexer.clear_comments
       @static_env.extend_static
     }
     bodystmt kEND
@@ -1200,26 +1196,22 @@ rule
                                   val[3], val[4])
 
       @static_env.unextend
-      @lexer.clear_comments
     }
   | kDEF fname
     {
-      @comments.push @lexer.clear_comments
       @def_level += 1
       @static_env.extend_static
     }
     f_arglist bodystmt kEND
     {
       result = @builder.def_method(val[0], val[1],
-                                    val[3], val[4], val[5], @comments.pop)
+                                    val[3], val[4], val[5])
 
       @static_env.unextend
       @def_level -= 1
-      @lexer.clear_comments
     }
   | kDEF singleton dot_or_colon
     {
-      @comments.push @lexer.clear_comments
       @lexer.state = :expr_fname
     }
     fname
@@ -1230,11 +1222,10 @@ rule
     f_arglist bodystmt kEND
     {
       result = @builder.def_singleton(val[0], val[1], val[2],
-                                       val[4], val[6], val[7], val[8], @comments.pop)
+                                       val[4], val[6], val[7], val[8])
 
       @static_env.unextend
       @def_level -= 1
-      @lexer.clear_comments
     }
   | kBREAK
     {
@@ -1112,7 +1112,6 @@ rule
         diagnostic(:error, :class_in_def, val[0])
       end
 
-      @comments.push @lexer.clear_comments
       @static_env.extend_static
     }
     bodystmt kEND
@@ -1123,7 +1122,6 @@ rule
                                   val[4], val[5])
 
       @static_env.unextend
-      @lexer.clear_comments
     }
   | kCLASS tLSHFT expr term
     {
@@ -1138,7 +1136,6 @@ rule
                                   val[5], val[6])
 
       @static_env.unextend
-      @lexer.clear_comments
 
       @def_level = val[4]
     }
@@ -1148,7 +1145,6 @@ rule
         diagnostic(:error, :module_in_def, val[0])
       end
 
-      @comments.push @lexer.clear_comments
       @static_env.extend_static
     }
     bodystmt kEND
@@ -1157,26 +1153,22 @@ rule
                                   val[3], val[4])
 
       @static_env.unextend
-      @lexer.clear_comments
     }
   | kDEF fname
     {
-      @comments.push @lexer.clear_comments
       @def_level += 1
       @static_env.extend_static
     }
     f_arglist bodystmt kEND
     {
       result = @builder.def_method(val[0], val[1],
-                                    val[3], val[4], val[5], @comments.pop)
+                                    val[3], val[4], val[5])
 
       @static_env.unextend
       @def_level -= 1
-      @lexer.clear_comments
     }
   | kDEF singleton dot_or_colon
     {
-      @comments.push @lexer.clear_comments
       @lexer.state = :expr_fname
     }
     fname
@@ -1187,11 +1179,10 @@ rule
     f_arglist bodystmt kEND
     {
       result = @builder.def_singleton(val[0], val[1], val[2],
-                                       val[4], val[6], val[7], val[8], @comments.pop)
+                                       val[4], val[6], val[7], val[8])
 
       @static_env.unextend
       @def_level -= 1
-      @lexer.clear_comments
     }
   | kBREAK
     {
@@ -2103,9 +2094,7 @@ keyword_variable: kNIL
     }
   | tLABEL arg_value
     {
-      # TODO: Extract colon
-      key = @builder.symbol(val[0])
-      result = @builder.pair(key, nil, val[1])
+      result = @builder.pair_keyword(val[0], val[1])
     }
 
 operation: tIDENTIFIER | tCONSTANT | tFID
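
`pair_keyword` hands the label token straight to the builder instead of synthesizing a symbol node plus a nil operator inside the grammar action. The resulting tree has the same shape as an explicit `=>` pair; a quick illustration, again assuming `parser/current` from a released 2.x gem:

    require 'parser/current'

    Parser::CurrentRuby.parse('{ foo: 1 }')
    # => s(:hash, s(:pair, s(:sym, :foo), s(:int, 1)))

    Parser::CurrentRuby.parse('{ :foo => 1 }')
    # => s(:hash, s(:pair, s(:sym, :foo), s(:int, 1)))
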