yarp 0.8.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +48 -1
  3. data/Makefile +5 -1
  4. data/README.md +4 -3
  5. data/config.yml +461 -150
  6. data/docs/configuration.md +1 -0
  7. data/docs/encoding.md +5 -5
  8. data/docs/ruby_api.md +2 -0
  9. data/docs/serialization.md +3 -3
  10. data/docs/testing.md +2 -2
  11. data/ext/yarp/api_node.c +810 -199
  12. data/ext/yarp/extension.c +94 -31
  13. data/ext/yarp/extension.h +2 -2
  14. data/include/yarp/ast.h +653 -150
  15. data/include/yarp/defines.h +2 -1
  16. data/include/yarp/diagnostic.h +3 -3
  17. data/include/yarp/enc/yp_encoding.h +10 -10
  18. data/include/yarp/node.h +10 -0
  19. data/include/yarp/parser.h +19 -19
  20. data/include/yarp/regexp.h +1 -1
  21. data/include/yarp/unescape.h +7 -5
  22. data/include/yarp/util/yp_buffer.h +3 -0
  23. data/include/yarp/util/yp_char.h +16 -16
  24. data/include/yarp/util/yp_constant_pool.h +2 -2
  25. data/include/yarp/util/yp_newline_list.h +7 -4
  26. data/include/yarp/util/yp_string.h +4 -4
  27. data/include/yarp/util/yp_string_list.h +0 -3
  28. data/include/yarp/util/yp_strpbrk.h +1 -1
  29. data/include/yarp/version.h +2 -2
  30. data/include/yarp.h +14 -3
  31. data/lib/yarp/desugar_visitor.rb +204 -0
  32. data/lib/yarp/ffi.rb +27 -1
  33. data/lib/yarp/lex_compat.rb +93 -25
  34. data/lib/yarp/mutation_visitor.rb +683 -0
  35. data/lib/yarp/node.rb +3121 -597
  36. data/lib/yarp/serialize.rb +198 -126
  37. data/lib/yarp.rb +53 -7
  38. data/src/diagnostic.c +1 -1
  39. data/src/enc/yp_big5.c +15 -42
  40. data/src/enc/yp_euc_jp.c +16 -43
  41. data/src/enc/yp_gbk.c +19 -46
  42. data/src/enc/yp_shift_jis.c +16 -43
  43. data/src/enc/yp_tables.c +36 -38
  44. data/src/enc/yp_unicode.c +20 -25
  45. data/src/enc/yp_windows_31j.c +16 -43
  46. data/src/node.c +1444 -836
  47. data/src/prettyprint.c +324 -103
  48. data/src/regexp.c +21 -21
  49. data/src/serialize.c +429 -276
  50. data/src/token_type.c +2 -2
  51. data/src/unescape.c +184 -136
  52. data/src/util/yp_buffer.c +7 -2
  53. data/src/util/yp_char.c +34 -34
  54. data/src/util/yp_constant_pool.c +4 -4
  55. data/src/util/yp_memchr.c +1 -1
  56. data/src/util/yp_newline_list.c +14 -3
  57. data/src/util/yp_string.c +22 -20
  58. data/src/util/yp_string_list.c +0 -6
  59. data/src/util/yp_strncasecmp.c +3 -6
  60. data/src/util/yp_strpbrk.c +8 -8
  61. data/src/yarp.c +1504 -615
  62. data/yarp.gemspec +3 -1
  63. metadata +4 -2
data/ext/yarp/extension.c CHANGED
@@ -83,7 +83,21 @@ dump(int argc, VALUE *argv, VALUE self) {
83
83
 
84
84
  yp_string_t input;
85
85
  input_load_string(&input, string);
86
- return dump_input(&input, check_string(filepath));
86
+
87
+ #ifdef YARP_DEBUG_MODE_BUILD
88
+ size_t length = yp_string_length(&input);
89
+ char* dup = malloc(length);
90
+ memcpy(dup, yp_string_source(&input), length);
91
+ yp_string_constant_init(&input, dup, length);
92
+ #endif
93
+
94
+ VALUE value = dump_input(&input, check_string(filepath));
95
+
96
+ #ifdef YARP_DEBUG_MODE_BUILD
97
+ free(dup);
98
+ #endif
99
+
100
+ return value;
87
101
  }
88
102
 
89
103
  // Dump the AST corresponding to the given file to a string.
@@ -198,52 +212,67 @@ typedef struct {
198
212
  VALUE source;
199
213
  VALUE tokens;
200
214
  rb_encoding *encoding;
201
- } lex_data_t;
215
+ } parse_lex_data_t;
202
216
 
203
217
  // This is passed as a callback to the parser. It gets called every time a new
204
218
  // token is found. Once found, we initialize a new instance of Token and push it
205
219
  // onto the tokens array.
206
220
  static void
207
- lex_token(void *data, yp_parser_t *parser, yp_token_t *token) {
208
- lex_data_t *lex_data = (lex_data_t *) parser->lex_callback->data;
221
+ parse_lex_token(void *data, yp_parser_t *parser, yp_token_t *token) {
222
+ parse_lex_data_t *parse_lex_data = (parse_lex_data_t *) parser->lex_callback->data;
209
223
 
210
224
  VALUE yields = rb_ary_new_capa(2);
211
- rb_ary_push(yields, yp_token_new(parser, token, lex_data->encoding, lex_data->source));
225
+ rb_ary_push(yields, yp_token_new(parser, token, parse_lex_data->encoding, parse_lex_data->source));
212
226
  rb_ary_push(yields, INT2FIX(parser->lex_state));
213
227
 
214
- rb_ary_push(lex_data->tokens, yields);
228
+ rb_ary_push(parse_lex_data->tokens, yields);
215
229
  }
216
230
 
217
231
  // This is called whenever the encoding changes based on the magic comment at
218
232
  // the top of the file. We use it to update the encoding that we are using to
219
233
  // create tokens.
220
234
  static void
221
- lex_encoding_changed_callback(yp_parser_t *parser) {
222
- lex_data_t *lex_data = (lex_data_t *) parser->lex_callback->data;
223
- lex_data->encoding = rb_enc_find(parser->encoding.name);
235
+ parse_lex_encoding_changed_callback(yp_parser_t *parser) {
236
+ parse_lex_data_t *parse_lex_data = (parse_lex_data_t *) parser->lex_callback->data;
237
+ parse_lex_data->encoding = rb_enc_find(parser->encoding.name);
238
+
239
+ // Since the encoding changed, we need to go back and change the encoding of
240
+ // the tokens that were already lexed. This is only going to end up being
241
+ // one or two tokens, since the encoding can only change at the top of the
242
+ // file.
243
+ VALUE tokens = parse_lex_data->tokens;
244
+ for (long index = 0; index < RARRAY_LEN(tokens); index++) {
245
+ VALUE yields = rb_ary_entry(tokens, index);
246
+ VALUE token = rb_ary_entry(yields, 0);
247
+
248
+ VALUE value = rb_ivar_get(token, rb_intern("@value"));
249
+ rb_enc_associate(value, parse_lex_data->encoding);
250
+ ENC_CODERANGE_CLEAR(value);
251
+ }
224
252
  }
225
253
 
226
- // Return an array of tokens corresponding to the given source.
254
+ // Parse the given input and return a ParseResult containing just the tokens or
255
+ // the nodes and tokens.
227
256
  static VALUE
228
- lex_input(yp_string_t *input, const char *filepath) {
257
+ parse_lex_input(yp_string_t *input, const char *filepath, bool return_nodes) {
229
258
  yp_parser_t parser;
230
259
  yp_parser_init(&parser, yp_string_source(input), yp_string_length(input), filepath);
231
- yp_parser_register_encoding_changed_callback(&parser, lex_encoding_changed_callback);
260
+ yp_parser_register_encoding_changed_callback(&parser, parse_lex_encoding_changed_callback);
232
261
 
233
262
  VALUE offsets = rb_ary_new();
234
- VALUE source_argv[] = { rb_str_new(yp_string_source(input), yp_string_length(input)), offsets };
263
+ VALUE source_argv[] = { rb_str_new((const char *) yp_string_source(input), yp_string_length(input)), offsets };
235
264
  VALUE source = rb_class_new_instance(2, source_argv, rb_cYARPSource);
236
265
 
237
- lex_data_t lex_data = {
266
+ parse_lex_data_t parse_lex_data = {
238
267
  .source = source,
239
268
  .tokens = rb_ary_new(),
240
269
  .encoding = rb_utf8_encoding()
241
270
  };
242
271
 
243
- lex_data_t *data = &lex_data;
272
+ parse_lex_data_t *data = &parse_lex_data;
244
273
  yp_lex_callback_t lex_callback = (yp_lex_callback_t) {
245
274
  .data = (void *) data,
246
- .callback = lex_token,
275
+ .callback = parse_lex_token,
247
276
  };
248
277
 
249
278
  parser.lex_callback = &lex_callback;
@@ -256,20 +285,26 @@ lex_input(yp_string_t *input, const char *filepath) {
256
285
  rb_ary_push(offsets, INT2FIX(parser.newline_list.offsets[index]));
257
286
  }
258
287
 
288
+ VALUE value;
289
+ if (return_nodes) {
290
+ value = rb_ary_new_capa(2);
291
+ rb_ary_push(value, yp_ast_new(&parser, node, parse_lex_data.encoding));
292
+ rb_ary_push(value, parse_lex_data.tokens);
293
+ } else {
294
+ value = parse_lex_data.tokens;
295
+ }
296
+
259
297
  VALUE result_argv[] = {
260
- lex_data.tokens,
298
+ value,
261
299
  parser_comments(&parser, source),
262
- parser_errors(&parser, lex_data.encoding, source),
263
- parser_warnings(&parser, lex_data.encoding, source),
300
+ parser_errors(&parser, parse_lex_data.encoding, source),
301
+ parser_warnings(&parser, parse_lex_data.encoding, source),
264
302
  source
265
303
  };
266
304
 
267
- VALUE result = rb_class_new_instance(5, result_argv, rb_cYARPParseResult);
268
-
269
305
  yp_node_destroy(&parser, node);
270
306
  yp_parser_free(&parser);
271
-
272
- return result;
307
+ return rb_class_new_instance(5, result_argv, rb_cYARPParseResult);
273
308
  }
274
309
 
275
310
  // Return an array of tokens corresponding to the given string.
@@ -281,7 +316,8 @@ lex(int argc, VALUE *argv, VALUE self) {
281
316
 
282
317
  yp_string_t input;
283
318
  input_load_string(&input, string);
284
- return lex_input(&input, check_string(filepath));
319
+
320
+ return parse_lex_input(&input, check_string(filepath), false);
285
321
  }
286
322
 
287
323
  // Return an array of tokens corresponding to the given file.
@@ -292,7 +328,7 @@ lex_file(VALUE self, VALUE filepath) {
292
328
  const char *checked = check_string(filepath);
293
329
  if (!yp_string_mapped_init(&input, checked)) return Qnil;
294
330
 
295
- VALUE value = lex_input(&input, checked);
331
+ VALUE value = parse_lex_input(&input, checked, false);
296
332
  yp_string_free(&input);
297
333
 
298
334
  return value;
@@ -368,6 +404,32 @@ parse_file(VALUE self, VALUE filepath) {
368
404
  return value;
369
405
  }
370
406
 
407
+ // Parse the given string and return a ParseResult instance.
408
+ static VALUE
409
+ parse_lex(int argc, VALUE *argv, VALUE self) {
410
+ VALUE string;
411
+ VALUE filepath;
412
+ rb_scan_args(argc, argv, "11", &string, &filepath);
413
+
414
+ yp_string_t input;
415
+ input_load_string(&input, string);
416
+ return parse_lex_input(&input, check_string(filepath), true);
417
+ }
418
+
419
+ // Parse and lex the given file and return a ParseResult instance.
420
+ static VALUE
421
+ parse_lex_file(VALUE self, VALUE filepath) {
422
+ yp_string_t input;
423
+
424
+ const char *checked = check_string(filepath);
425
+ if (!yp_string_mapped_init(&input, checked)) return Qnil;
426
+
427
+ VALUE value = parse_lex_input(&input, checked, true);
428
+ yp_string_free(&input);
429
+
430
+ return value;
431
+ }
432
+
371
433
  /******************************************************************************/
372
434
  /* Utility functions exposed to make testing easier */
373
435
  /******************************************************************************/
@@ -380,7 +442,7 @@ named_captures(VALUE self, VALUE source) {
380
442
  yp_string_list_t string_list;
381
443
  yp_string_list_init(&string_list);
382
444
 
383
- if (!yp_regexp_named_capture_group_names(RSTRING_PTR(source), RSTRING_LEN(source), &string_list, false, &yp_encoding_utf_8)) {
445
+ if (!yp_regexp_named_capture_group_names((const uint8_t *) RSTRING_PTR(source), RSTRING_LEN(source), &string_list, false, &yp_encoding_utf_8)) {
384
446
  yp_string_list_free(&string_list);
385
447
  return Qnil;
386
448
  }
@@ -388,7 +450,7 @@ named_captures(VALUE self, VALUE source) {
388
450
  VALUE names = rb_ary_new();
389
451
  for (size_t index = 0; index < string_list.length; index++) {
390
452
  const yp_string_t *string = &string_list.strings[index];
391
- rb_ary_push(names, rb_str_new(yp_string_source(string), yp_string_length(string)));
453
+ rb_ary_push(names, rb_str_new((const char *) yp_string_source(string), yp_string_length(string)));
392
454
  }
393
455
 
394
456
  yp_string_list_free(&string_list);
@@ -401,8 +463,8 @@ static VALUE
401
463
  unescape(VALUE source, yp_unescape_type_t unescape_type) {
402
464
  yp_string_t result;
403
465
 
404
- if (yp_unescape_string(RSTRING_PTR(source), RSTRING_LEN(source), unescape_type, &result)) {
405
- VALUE str = rb_str_new(yp_string_source(&result), yp_string_length(&result));
466
+ if (yp_unescape_string((const uint8_t *) RSTRING_PTR(source), RSTRING_LEN(source), unescape_type, &result)) {
467
+ VALUE str = rb_str_new((const char *) yp_string_source(&result), yp_string_length(&result));
406
468
  yp_string_free(&result);
407
469
  return str;
408
470
  } else {
@@ -436,7 +498,7 @@ static VALUE
436
498
  memsize(VALUE self, VALUE string) {
437
499
  yp_parser_t parser;
438
500
  size_t length = RSTRING_LEN(string);
439
- yp_parser_init(&parser, RSTRING_PTR(string), length, NULL);
501
+ yp_parser_init(&parser, (const uint8_t *) RSTRING_PTR(string), length, NULL);
440
502
 
441
503
  yp_node_t *node = yp_parse(&parser);
442
504
  yp_memsize_t memsize;
@@ -521,7 +583,6 @@ Init_yarp(void) {
521
583
  // Define the version string here so that we can use the constants defined
522
584
  // in yarp.h.
523
585
  rb_define_const(rb_cYARP, "VERSION", rb_str_new2(EXPECTED_YARP_VERSION));
524
-
525
586
  rb_define_const(rb_cYARP, "BACKEND", ID2SYM(rb_intern("CExtension")));
526
587
 
527
588
  // First, the functions that have to do with lexing and parsing.
@@ -531,6 +592,8 @@ Init_yarp(void) {
531
592
  rb_define_singleton_method(rb_cYARP, "lex_file", lex_file, 1);
532
593
  rb_define_singleton_method(rb_cYARP, "parse", parse, -1);
533
594
  rb_define_singleton_method(rb_cYARP, "parse_file", parse_file, 1);
595
+ rb_define_singleton_method(rb_cYARP, "parse_lex", parse_lex, -1);
596
+ rb_define_singleton_method(rb_cYARP, "parse_lex_file", parse_lex_file, 1);
534
597
 
535
598
  // Next, the functions that will be called by the parser to perform various
536
599
  // internal tasks. We expose these to make them easier to test.
data/ext/yarp/extension.h CHANGED
@@ -1,12 +1,12 @@
1
1
  #ifndef YARP_EXT_NODE_H
2
2
  #define YARP_EXT_NODE_H
3
3
 
4
+ #define EXPECTED_YARP_VERSION "0.10.0"
5
+
4
6
  #include <ruby.h>
5
7
  #include <ruby/encoding.h>
6
8
  #include "yarp.h"
7
9
 
8
- #define EXPECTED_YARP_VERSION "0.8.0"
9
-
10
10
  VALUE yp_source_new(yp_parser_t *parser);
11
11
  VALUE yp_token_new(yp_parser_t *parser, yp_token_t *token, rb_encoding *encoding, VALUE source);
12
12
  VALUE yp_ast_new(yp_parser_t *parser, yp_node_t *node, rb_encoding *encoding);