parser 1.4.2 → 2.0.0.beta1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -88,17 +88,20 @@ class Parser::Lexer
88
88
 
89
89
  attr_accessor :cond, :cmdarg
90
90
 
91
- attr_reader :comments
91
+ attr_accessor :tokens, :comments
92
92
 
93
93
  def initialize(version)
94
94
  @version = version
95
95
  @static_env = nil
96
96
 
97
+ @tokens = nil
98
+ @comments = nil
99
+
97
100
  reset
98
101
  end
99
102
 
100
103
  def reset(reset_state=true)
101
- # Ragel-related variables:
104
+ # Ragel state:
102
105
  if reset_state
103
106
  # Unit tests set state prior to resetting lexer.
104
107
  @cs = self.class.lex_en_line_begin
@@ -119,7 +122,8 @@ class Parser::Lexer
119
122
  @token_queue = []
120
123
  @literal_stack = []
121
124
 
122
- @comments = "" # collected comments
125
+ @eq_begin_s = nil # location of last encountered =begin
126
+ @sharp_s = nil # location of last encountered #
123
127
 
124
128
  @newline_s = nil # location of last encountered newline
125
129
 
@@ -150,7 +154,7 @@ class Parser::Lexer
150
154
  #
151
155
  # Patches accepted.
152
156
  #
153
- @source = @source_buffer.source + "\0\0\0"
157
+ @source = @source_buffer.source.gsub(/\r\n/, "\n") + "\0\0\0"
154
158
  else
155
159
  @source = nil
156
160
  end
@@ -212,14 +216,6 @@ class Parser::Lexer
212
216
  end
213
217
  end
214
218
 
215
- # Return the current collected comment block and clear the storage.
216
- def clear_comments
217
- comments = @comments
218
- @comments = ""
219
-
220
- comments
221
- end
222
-
223
219
  protected
224
220
 
225
221
  def eof_char?(char)
@@ -244,7 +240,13 @@ class Parser::Lexer
244
240
  end
245
241
 
246
242
  def emit(type, value = tok, s = @ts, e = @te)
247
- @token_queue << [ type, [ value, range(s, e) ] ]
243
+ token = [ type, [ value, range(s, e) ] ]
244
+
245
+ @token_queue.push(token)
246
+
247
+ @tokens.push(token) if @tokens
248
+
249
+ token
248
250
  end
249
251
 
250
252
  def emit_table(table, s = @ts, e = @te)
@@ -253,6 +255,18 @@ class Parser::Lexer
253
255
  emit(table[value], value, s, e)
254
256
  end
255
257
 
258
+ def emit_comment(s = @ts, e = @te)
259
+ if @comments
260
+ @comments.push(Parser::Source::Comment.new(range(s, e)))
261
+ end
262
+
263
+ if @tokens
264
+ @tokens.push([ :tCOMMENT, [ tok(s, e), range(s, e) ] ])
265
+ end
266
+
267
+ nil
268
+ end
269
+
256
270
  def diagnostic(type, message, location=range, highlights=[])
257
271
  @diagnostics.process(
258
272
  Parser::Diagnostic.new(type, message, location, highlights))
@@ -363,7 +377,7 @@ class Parser::Lexer
363
377
  @newline_s = p
364
378
  }
365
379
 
366
- c_nl = '\r'? '\n' $ do_nl;
380
+ c_nl = '\n' $ do_nl;
367
381
  c_space = [ \t\r\f\v];
368
382
  c_space_nl = c_space | c_nl;
369
383
  c_eof = 0x04 | 0x1a | 0; # ^D, ^Z, EOF
@@ -486,21 +500,23 @@ class Parser::Lexer
486
500
  action unicode_points {
487
501
  @escape = ""
488
502
 
489
- codepoints = tok(@escape_s + 2, p - 1)
503
+ codepoints = tok(@escape_s + 2, p - 1)
504
+ codepoint_s = @escape_s + 2
505
+
490
506
  codepoints.split(/[ \t]/).each do |codepoint_str|
491
507
  codepoint = codepoint_str.to_i(16)
492
508
 
493
509
  if codepoint >= 0x110000
494
510
  @escape = lambda do
495
- # TODO better location reporting
496
511
  diagnostic :error, Parser::ERRORS[:unicode_point_too_large],
497
- range(@escape_s, p)
512
+ range(codepoint_s, codepoint_s + codepoint_str.length)
498
513
  end
499
514
 
500
515
  break
501
516
  end
502
517
 
503
518
  @escape += codepoint.chr(Encoding::UTF_8)
519
+ codepoint_s += codepoint_str.length + 1
504
520
  end
505
521
  }
506
522
 
@@ -886,6 +902,44 @@ class Parser::Lexer
886
902
  };
887
903
  *|;
888
904
 
905
+ #
906
+ # === WHITESPACE HANDLING ===
907
+ #
908
+
909
+ # Various contexts in Ruby allow various kinds of whitespace
910
+ # to be used. They are grouped to clarify the lexing machines
911
+ # and ease collection of comments.
912
+
913
+ # A line of code with inline #comment at end is always equivalent
914
+ # to a line of code ending with just a newline, so an inline
915
+ # comment is deemed equivalent to non-newline whitespace
916
+ # (c_space character class).
917
+
918
+ w_space =
919
+ c_space+
920
+ | '\\' e_heredoc_nl
921
+ | '#' %{ @sharp_s = p - 1 } c_line* %{ emit_comment(@sharp_s, p) }
922
+ ;
923
+
924
+ # A newline in non-literal context always interoperates with
925
+ # here document logic and can always be escaped by a backslash,
926
+ # still interoperating with here document logic in the same way,
927
+ # yet being invisible to anything else.
928
+ #
929
+ # To demonstrate:
930
+ #
931
+ # foo = <<FOO \
932
+ # bar
933
+ # FOO
934
+ # + 2
935
+ #
936
+ # is equivalent to `foo = "bar\n" + 2`.
937
+
938
+ w_newline = e_heredoc_nl;
939
+
940
+ w_space_newline = w_space | w_newline;
941
+
942
+
889
943
  #
890
944
  # === EXPRESSION PARSING ===
891
945
  #
@@ -1019,9 +1073,7 @@ class Parser::Lexer
1019
1073
  ':'
1020
1074
  => { fhold; fgoto expr_beg; };
1021
1075
 
1022
- # TODO whitespace rule
1023
- c_space;
1024
- e_heredoc_nl;
1076
+ w_space_newline;
1025
1077
 
1026
1078
  c_any
1027
1079
  => { fhold; fgoto expr_end; };
@@ -1037,10 +1089,9 @@ class Parser::Lexer
1037
1089
  expr_endfn := |*
1038
1090
  bareword ':'
1039
1091
  => { emit(:tLABEL, tok(@ts, @te - 1))
1040
- fnext expr_end; fbreak; };
1092
+ fnext expr_beg; fbreak; };
1041
1093
 
1042
- # TODO whitespace rule
1043
- c_space;
1094
+ w_space;
1044
1095
 
1045
1096
  c_any
1046
1097
  => { fhold; fgoto expr_end; };
@@ -1072,11 +1123,7 @@ class Parser::Lexer
1072
1123
  => { emit_table(PUNCTUATION)
1073
1124
  fnext expr_arg; fbreak; };
1074
1125
 
1075
- # TODO whitespace rule
1076
- c_space;
1077
- e_heredoc_nl;
1078
- '\\' e_heredoc_nl;
1079
- '#' c_line*;
1126
+ w_space_newline;
1080
1127
 
1081
1128
  c_any
1082
1129
  => { fhold; fgoto expr_end; };
@@ -1194,7 +1241,7 @@ class Parser::Lexer
1194
1241
  fgoto expr_end;
1195
1242
  };
1196
1243
 
1197
- c_space* ( '#' c_line* )? c_nl
1244
+ w_space? w_newline
1198
1245
  => { fhold; fgoto expr_end; };
1199
1246
 
1200
1247
  c_any
@@ -1227,7 +1274,7 @@ class Parser::Lexer
1227
1274
  => { emit(:kDO_BLOCK)
1228
1275
  fnext expr_value; };
1229
1276
 
1230
- c_space;
1277
+ w_space;
1231
1278
 
1232
1279
  c_any
1233
1280
  => { fhold; fgoto expr_end; };
@@ -1245,11 +1292,8 @@ class Parser::Lexer
1245
1292
  => { emit_table(KEYWORDS)
1246
1293
  fnext expr_beg; fbreak; };
1247
1294
 
1248
- # TODO whitespace rule
1249
- c_space;
1250
- '#' c_line*;
1251
-
1252
- e_heredoc_nl
1295
+ w_space;
1296
+ w_newline
1253
1297
  => { fhold; fgoto expr_end; };
1254
1298
 
1255
1299
  c_any
@@ -1487,14 +1531,7 @@ class Parser::Lexer
1487
1531
  # WHITESPACE
1488
1532
  #
1489
1533
 
1490
- # TODO whitespace rule
1491
- c_space;
1492
- e_heredoc_nl;
1493
- '\\' e_heredoc_nl;
1494
-
1495
- '#' c_line* c_eol
1496
- => { @comments << tok
1497
- fhold; };
1534
+ w_space_newline;
1498
1535
 
1499
1536
  e_heredoc_nl '=begin' ( c_space | c_eol )
1500
1537
  => { p = @ts - 1
@@ -1523,8 +1560,7 @@ class Parser::Lexer
1523
1560
  => { p = @ts - 1
1524
1561
  fgoto expr_end; };
1525
1562
 
1526
- # TODO whitespace rule
1527
- c_space;
1563
+ w_space;
1528
1564
 
1529
1565
  c_any
1530
1566
  => { fhold; fgoto expr_beg; };
@@ -1779,8 +1815,13 @@ class Parser::Lexer
1779
1815
  # WHITESPACE
1780
1816
  #
1781
1817
 
1782
- # TODO whitespace rule
1783
- '\\' e_heredoc_nl;
1818
+ w_space;
1819
+ w_newline
1820
+ => { fgoto leading_dot; };
1821
+
1822
+ ';'
1823
+ => { emit_table(PUNCTUATION)
1824
+ fnext expr_value; fbreak; };
1784
1825
 
1785
1826
  '\\' c_line {
1786
1827
  diagnostic :error, Parser::ERRORS[:bare_backslash],
@@ -1788,18 +1829,6 @@ class Parser::Lexer
1788
1829
  fhold;
1789
1830
  };
1790
1831
 
1791
- c_space+;
1792
-
1793
- '#' c_line*
1794
- => { @comments << tok(@ts, @te + 1) };
1795
-
1796
- e_heredoc_nl
1797
- => { fgoto leading_dot; };
1798
-
1799
- ';'
1800
- => { emit_table(PUNCTUATION)
1801
- fnext expr_value; fbreak; };
1802
-
1803
1832
  c_any
1804
1833
  => {
1805
1834
  message = Parser::ERRORS[:unexpected] % { :character => tok.inspect[1..-2] }
@@ -1827,30 +1856,26 @@ class Parser::Lexer
1827
1856
  #
1828
1857
 
1829
1858
  line_comment := |*
1830
- '=end' c_line* c_nl
1831
- => { @comments << tok
1832
- fgoto line_begin; };
1859
+ '=end' c_line* c_nl?
1860
+ => {
1861
+ emit_comment(@eq_begin_s, @te)
1862
+ fgoto line_begin;
1863
+ };
1833
1864
 
1834
- c_line* c_nl
1835
- => { @comments << tok };
1865
+ c_any;
1836
1866
 
1837
1867
  c_eof
1838
1868
  => {
1839
- # TODO better location information here
1840
- diagnostic :fatal, Parser::ERRORS[:embedded_document], range(p - 1, p)
1869
+ diagnostic :fatal, Parser::ERRORS[:embedded_document],
1870
+ range(@eq_begin_s, @eq_begin_s + '=begin'.length)
1841
1871
  };
1842
1872
  *|;
1843
1873
 
1844
1874
  line_begin := |*
1845
- # TODO whitespace rule
1846
- c_space_nl+;
1847
-
1848
- '#' c_line* c_eol
1849
- => { @comments << tok
1850
- fhold; };
1875
+ w_space_newline;
1851
1876
 
1852
1877
  '=begin' ( c_space | c_eol )
1853
- => { @comments << tok(@ts, @te)
1878
+ => { @eq_begin_s = @ts
1854
1879
  fgoto line_comment; };
1855
1880
 
1856
1881
  '__END__' c_eol
@@ -2,19 +2,10 @@ module Parser
2
2
 
3
3
  module Lexer::Explanation
4
4
 
5
- def self.included(klass)
6
- klass.class_exec do
7
- alias_method :state_before_explanation=, :state=
8
- alias_method :advance_before_explanation, :advance
9
-
10
- remove_method :state=, :advance
11
- end
12
- end
13
-
14
5
  # Like #advance, but also pretty-print the token and its position
15
6
  # in the stream to `stdout`.
16
7
  def advance
17
- type, (val, range) = advance_before_explanation
8
+ type, (val, range) = super
18
9
 
19
10
  puts decorate(range,
20
11
  "\e[0;32m#{type} #{val.inspect}\e[0m",
@@ -27,7 +18,7 @@ module Parser
27
18
  puts " \e[1;33m>>> STATE SET <<<\e[0m " +
28
19
  "#{new_state.to_s.ljust(12)} #{@cond} #{@cmdarg}".rjust(66)
29
20
 
30
- self.state_before_explanation = new_state
21
+ super
31
22
  end
32
23
 
33
24
  private
@@ -11,8 +11,7 @@ module Parser
11
11
  private
12
12
 
13
13
  def assignment?(node)
14
- [:lvasgn, :ivasgn, :gvasgn,
15
- :cvasgn, :cvdecl, :cdecl].include?(node.type)
14
+ [:lvasgn, :ivasgn, :gvasgn, :cvasgn, :casgn].include?(node.type)
16
15
  end
17
16
 
18
17
  def remove(range)
@@ -1155,7 +1155,6 @@ rule
1155
1155
  diagnostic(:error, :class_in_def, val[0])
1156
1156
  end
1157
1157
 
1158
- @comments.push @lexer.clear_comments
1159
1158
  @static_env.extend_static
1160
1159
  }
1161
1160
  bodystmt kEND
@@ -1166,7 +1165,6 @@ rule
1166
1165
  val[4], val[5])
1167
1166
 
1168
1167
  @static_env.unextend
1169
- @lexer.clear_comments
1170
1168
  }
1171
1169
  | kCLASS tLSHFT expr term
1172
1170
  {
@@ -1181,7 +1179,6 @@ rule
1181
1179
  val[5], val[6])
1182
1180
 
1183
1181
  @static_env.unextend
1184
- @lexer.clear_comments
1185
1182
 
1186
1183
  @def_level = val[4]
1187
1184
  }
@@ -1191,7 +1188,6 @@ rule
1191
1188
  diagnostic(:error, :module_in_def, val[0])
1192
1189
  end
1193
1190
 
1194
- @comments.push @lexer.clear_comments
1195
1191
  @static_env.extend_static
1196
1192
  }
1197
1193
  bodystmt kEND
@@ -1200,26 +1196,22 @@ rule
1200
1196
  val[3], val[4])
1201
1197
 
1202
1198
  @static_env.unextend
1203
- @lexer.clear_comments
1204
1199
  }
1205
1200
  | kDEF fname
1206
1201
  {
1207
- @comments.push @lexer.clear_comments
1208
1202
  @def_level += 1
1209
1203
  @static_env.extend_static
1210
1204
  }
1211
1205
  f_arglist bodystmt kEND
1212
1206
  {
1213
1207
  result = @builder.def_method(val[0], val[1],
1214
- val[3], val[4], val[5], @comments.pop)
1208
+ val[3], val[4], val[5])
1215
1209
 
1216
1210
  @static_env.unextend
1217
1211
  @def_level -= 1
1218
- @lexer.clear_comments
1219
1212
  }
1220
1213
  | kDEF singleton dot_or_colon
1221
1214
  {
1222
- @comments.push @lexer.clear_comments
1223
1215
  @lexer.state = :expr_fname
1224
1216
  }
1225
1217
  fname
@@ -1230,11 +1222,10 @@ rule
1230
1222
  f_arglist bodystmt kEND
1231
1223
  {
1232
1224
  result = @builder.def_singleton(val[0], val[1], val[2],
1233
- val[4], val[6], val[7], val[8], @comments.pop)
1225
+ val[4], val[6], val[7], val[8])
1234
1226
 
1235
1227
  @static_env.unextend
1236
1228
  @def_level -= 1
1237
- @lexer.clear_comments
1238
1229
  }
1239
1230
  | kBREAK
1240
1231
  {
@@ -1112,7 +1112,6 @@ rule
1112
1112
  diagnostic(:error, :class_in_def, val[0])
1113
1113
  end
1114
1114
 
1115
- @comments.push @lexer.clear_comments
1116
1115
  @static_env.extend_static
1117
1116
  }
1118
1117
  bodystmt kEND
@@ -1123,7 +1122,6 @@ rule
1123
1122
  val[4], val[5])
1124
1123
 
1125
1124
  @static_env.unextend
1126
- @lexer.clear_comments
1127
1125
  }
1128
1126
  | kCLASS tLSHFT expr term
1129
1127
  {
@@ -1138,7 +1136,6 @@ rule
1138
1136
  val[5], val[6])
1139
1137
 
1140
1138
  @static_env.unextend
1141
- @lexer.clear_comments
1142
1139
 
1143
1140
  @def_level = val[4]
1144
1141
  }
@@ -1148,7 +1145,6 @@ rule
1148
1145
  diagnostic(:error, :module_in_def, val[0])
1149
1146
  end
1150
1147
 
1151
- @comments.push @lexer.clear_comments
1152
1148
  @static_env.extend_static
1153
1149
  }
1154
1150
  bodystmt kEND
@@ -1157,26 +1153,22 @@ rule
1157
1153
  val[3], val[4])
1158
1154
 
1159
1155
  @static_env.unextend
1160
- @lexer.clear_comments
1161
1156
  }
1162
1157
  | kDEF fname
1163
1158
  {
1164
- @comments.push @lexer.clear_comments
1165
1159
  @def_level += 1
1166
1160
  @static_env.extend_static
1167
1161
  }
1168
1162
  f_arglist bodystmt kEND
1169
1163
  {
1170
1164
  result = @builder.def_method(val[0], val[1],
1171
- val[3], val[4], val[5], @comments.pop)
1165
+ val[3], val[4], val[5])
1172
1166
 
1173
1167
  @static_env.unextend
1174
1168
  @def_level -= 1
1175
- @lexer.clear_comments
1176
1169
  }
1177
1170
  | kDEF singleton dot_or_colon
1178
1171
  {
1179
- @comments.push @lexer.clear_comments
1180
1172
  @lexer.state = :expr_fname
1181
1173
  }
1182
1174
  fname
@@ -1187,11 +1179,10 @@ rule
1187
1179
  f_arglist bodystmt kEND
1188
1180
  {
1189
1181
  result = @builder.def_singleton(val[0], val[1], val[2],
1190
- val[4], val[6], val[7], val[8], @comments.pop)
1182
+ val[4], val[6], val[7], val[8])
1191
1183
 
1192
1184
  @static_env.unextend
1193
1185
  @def_level -= 1
1194
- @lexer.clear_comments
1195
1186
  }
1196
1187
  | kBREAK
1197
1188
  {
@@ -2103,9 +2094,7 @@ keyword_variable: kNIL
2103
2094
  }
2104
2095
  | tLABEL arg_value
2105
2096
  {
2106
- # TODO: Extract colon
2107
- key = @builder.symbol(val[0])
2108
- result = @builder.pair(key, nil, val[1])
2097
+ result = @builder.pair_keyword(val[0], val[1])
2109
2098
  }
2110
2099
 
2111
2100
  operation: tIDENTIFIER | tCONSTANT | tFID