rbs 3.10.0.pre.1 → 3.10.0.pre.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7b19eff56496de03c74195186b75b24f8f43140ebba2198c5e32ab5fe539d04b
4
- data.tar.gz: 63e7ebea19be4ce1caa1a9feed6147204f1f82c1349d6b0be13672de70354c96
3
+ metadata.gz: 2dde36e1704f20a8ad83c8917f449343563d933baf90673a17d2570ec7c44f5a
4
+ data.tar.gz: 051b863ff0f5fac88ff4ab53f3a8434601c3c42803495d354ad0a004c27e7655
5
5
  SHA512:
6
- metadata.gz: 0d77ffeb469a4dab695d55341a7b2def82c5501c5ddd0d91264cd4d8fbe265f53c46a0d0f18d5fbfad8155cebaa361b0b35472cfc1f00ac14c5076c7b99c2e2c
7
- data.tar.gz: 9f53bd43380fec3f6df8f94e7b8e890f746d5b3df1bc61d4d443132871363176340cad548ca10fbc54ac1c5070e260ca3d576e679304b31b937ce697f921884e
6
+ metadata.gz: 9820f43da6cb10c74015b212a9b37f1f1b4b46e1d8d9ec0c7e1c7304bdc49024691db5f03b776179715161e75eec166c5179eeab63adbd4e1044a811de7f8bc6
7
+ data.tar.gz: 1b75689099bd54f88dd2fd30b73fa98a2d0b76894c817b59e236109eb4924a9141d52d8454a82aef98bbe6dbb05ee5b8da9bf6d66254cba88797c39de020ddd5
data/docs/encoding.md ADDED
@@ -0,0 +1,56 @@
1
+ # RBS File Encoding
2
+
3
+ ## Best Practice
4
+
5
+ **Use UTF-8** for both file encoding and your system locale.
6
+
7
+ ## Supported Encodings
8
+
9
+ The RBS parser supports ASCII-compatible encodings (similar to Ruby's script encoding support).
10
+
11
+ **Examples**: UTF-8, US-ASCII, Shift JIS, EUC-JP, ...
12
+
13
+ ## Unicode Codepoint Symbols
14
+
15
+ String literal types in RBS can contain Unicode codepoint escape sequences (`\uXXXX`).
16
+
17
+ When the file encoding is UTF-8, the parser translates Unicode codepoint escape sequences into the corresponding characters:
18
+
19
+ ```rbs
20
+ # In UTF-8 encoded files
21
+
22
+ type t = "\u0123" # Translated to the actual Unicode character ģ
23
+ type s = "\u3042" # Translated to the actual Unicode character あ
24
+ ```
25
+
26
+ When the file encoding is not UTF-8, Unicode escape sequences are interpreted literally as the string `\uXXXX`:
27
+
28
+ ```rbs
29
+ # In non-UTF-8 encoded files
30
+
31
+ type t = "\u0123" # Remains as the literal string "\u0123"
32
+ ```
33
+
34
+ ## Implementation
35
+
36
+ The RBS gem currently does nothing special to handle file encoding. It relies on Ruby's encoding handling, specifically `Encoding.default_external` and `Encoding.default_internal`.
37
+
38
+ `Encoding.default_external` is the encoding Ruby assumes when it reads external resources like files. The Ruby interpreter sets it based on the locale. `Encoding.default_internal` is the encoding Ruby converts external resources to. The default is `nil` (no conversion).
39
+
40
+ When your locale is set to use `UTF-8` encoding, `default_external` is `Encoding::UTF_8`. So the RBS file content read from the disk will have UTF-8 encoding.
41
+
42
+ ### Parsing non-UTF-8 RBS source text
43
+
44
+ If you want to work with another encoding, ensure the source string has an ASCII-compatible encoding.
45
+
46
+ ```ruby
47
+ source = '"日本語"'
48
+ RBS::Parser.parse_type(source.encode(Encoding::EUC_JP)) # => Parses successfully
49
+ RBS::Parser.parse_type(source.encode(Encoding::UTF_32)) # => Returns `nil` since UTF-32 is not ASCII compatible
50
+ ```
51
+
52
+ ### Specifying file encoding
53
+
54
+ Currently, RBS doesn't support specifying file encoding directly.
55
+
56
+ You can set `Encoding.default_external` to control the encoding used when the gem loads RBS files from storage.
@@ -7,8 +7,6 @@
7
7
 
8
8
  #include "rbs_extension.h"
9
9
 
10
- VALUE RBS_Parser;
11
-
12
10
  VALUE RBS;
13
11
  VALUE RBS_AST;
14
12
  VALUE RBS_AST_Declarations;
@@ -33,7 +33,7 @@ void rbs_loc_legacy_alloc_children(rbs_loc *loc, unsigned short cap) {
33
33
  check_children_max(cap);
34
34
 
35
35
  size_t s = RBS_LOC_CHILDREN_SIZE(cap);
36
- loc->children = malloc(s);
36
+ loc->children = (rbs_loc_children *) malloc(s);
37
37
 
38
38
  *loc->children = (rbs_loc_children) {
39
39
  .len = 0,
@@ -50,7 +50,7 @@ static void check_children_cap(rbs_loc *loc) {
50
50
  if (loc->children->len == loc->children->cap) {
51
51
  check_children_max(loc->children->cap + 1);
52
52
  size_t s = RBS_LOC_CHILDREN_SIZE(++loc->children->cap);
53
- loc->children = realloc(loc->children, s);
53
+ loc->children = (rbs_loc_children *) realloc(loc->children, s);
54
54
  }
55
55
  }
56
56
  }
@@ -86,12 +86,12 @@ void rbs_loc_free(rbs_loc *loc) {
86
86
  }
87
87
 
88
88
  static void rbs_loc_mark(void *ptr) {
89
- rbs_loc *loc = ptr;
89
+ rbs_loc *loc = (rbs_loc *) ptr;
90
90
  rb_gc_mark(loc->buffer);
91
91
  }
92
92
 
93
93
  static size_t rbs_loc_memsize(const void *ptr) {
94
- const rbs_loc *loc = ptr;
94
+ const rbs_loc *loc = (const rbs_loc *) ptr;
95
95
  if (loc->children == NULL) {
96
96
  return sizeof(rbs_loc);
97
97
  } else {
@@ -117,7 +117,7 @@ static VALUE location_s_allocate(VALUE klass) {
117
117
  }
118
118
 
119
119
  rbs_loc *rbs_check_location(VALUE obj) {
120
- return rb_check_typeddata(obj, &location_type);
120
+ return (rbs_loc *) rb_check_typeddata(obj, &location_type);
121
121
  }
122
122
 
123
123
  static VALUE location_initialize(VALUE self, VALUE buffer, VALUE start_pos, VALUE end_pos) {
@@ -187,18 +187,10 @@ static VALUE parse_method_type_try(VALUE a) {
187
187
  }
188
188
 
189
189
  rbs_method_type_t *method_type = NULL;
190
- rbs_parse_method_type(parser, &method_type);
190
+ rbs_parse_method_type(parser, &method_type, RB_TEST(arg->require_eof));
191
191
 
192
192
  raise_error_if_any(parser, arg->buffer);
193
193
 
194
- if (RB_TEST(arg->require_eof)) {
195
- rbs_parser_advance(parser);
196
- if (parser->current_token.type != pEOF) {
197
- rbs_parser_set_error(parser, parser->current_token, true, "expected a token `%s`", rbs_token_type_str(pEOF));
198
- raise_error(parser->error, arg->buffer);
199
- }
200
- }
201
-
202
194
  rbs_translation_context_t ctx = rbs_translation_context_create(
203
195
  &parser->constant_pool,
204
196
  arg->buffer,
data/include/rbs/parser.h CHANGED
@@ -44,7 +44,7 @@ typedef struct rbs_error_t {
44
44
  * An RBS parser is a LL(3) parser.
45
45
  * */
46
46
  typedef struct {
47
- rbs_lexer_t *rbs_lexer_t;
47
+ rbs_lexer_t *lexer;
48
48
 
49
49
  rbs_token_t current_token;
50
50
  rbs_token_t next_token; /* The first lookahead token */
@@ -127,7 +127,7 @@ rbs_ast_comment_t *rbs_parser_get_comment(rbs_parser_t *parser, int subject_line
127
127
  void rbs_parser_set_error(rbs_parser_t *parser, rbs_token_t tok, bool syntax_error, const char *fmt, ...) RBS_ATTRIBUTE_FORMAT(4, 5);
128
128
 
129
129
  bool rbs_parse_type(rbs_parser_t *parser, rbs_node_t **type);
130
- bool rbs_parse_method_type(rbs_parser_t *parser, rbs_method_type_t **method_type);
130
+ bool rbs_parse_method_type(rbs_parser_t *parser, rbs_method_type_t **method_type, bool require_eof);
131
131
  bool rbs_parse_signature(rbs_parser_t *parser, rbs_signature_t **signature);
132
132
 
133
133
  bool rbs_parse_type_params(rbs_parser_t *parser, bool module_type_params, rbs_node_list_t **params);
data/include/rbs/string.h CHANGED
@@ -44,6 +44,4 @@ size_t rbs_string_len(const rbs_string_t self);
44
44
  */
45
45
  bool rbs_string_equal(const rbs_string_t lhs, const rbs_string_t rhs);
46
46
 
47
- unsigned int rbs_utf8_string_to_codepoint(const rbs_string_t string);
48
-
49
47
  #endif
@@ -4,6 +4,7 @@
4
4
  #include <stddef.h>
5
5
  #include "rbs/util/rbs_allocator.h"
6
6
  #include "rbs/string.h"
7
+ #include "rbs/util/rbs_encoding.h"
7
8
 
8
9
  /**
9
10
  * Receives `rbs_parser_t` and `range`, which represents a string token or symbol token, and returns a string VALUE.
@@ -18,6 +19,6 @@
18
19
  *
19
20
  * @returns A new owned string that will be freed when the allocator is freed.
20
21
  * */
21
- rbs_string_t rbs_unquote_string(rbs_allocator_t *, const rbs_string_t input);
22
+ rbs_string_t rbs_unquote_string(rbs_allocator_t *, const rbs_string_t input, const rbs_encoding_t *encoding);
22
23
 
23
24
  #endif // RBS_RBS_UNESCAPE_H
data/lib/rbs/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module RBS
4
- VERSION = "3.10.0.pre.1"
4
+ VERSION = "3.10.0.pre.2"
5
5
  end
data/src/location.c CHANGED
@@ -8,7 +8,7 @@
8
8
  void rbs_loc_alloc_children(rbs_allocator_t *allocator, rbs_location_t *loc, size_t capacity) {
9
9
  RBS_ASSERT(capacity <= sizeof(rbs_loc_entry_bitmap) * 8, "Capacity %zu is too large. Max is %zu", capacity, sizeof(rbs_loc_entry_bitmap) * 8);
10
10
 
11
- loc->children = rbs_allocator_malloc_impl(allocator, RBS_LOC_CHILDREN_SIZE(capacity), rbs_alignof(rbs_loc_children));
11
+ loc->children = (rbs_loc_children *) rbs_allocator_malloc_impl(allocator, RBS_LOC_CHILDREN_SIZE(capacity), rbs_alignof(rbs_loc_children));
12
12
 
13
13
  loc->children->len = 0;
14
14
  loc->children->required_p = 0;
data/src/parser.c CHANGED
@@ -20,12 +20,12 @@
20
20
  strlen(str) \
21
21
  )
22
22
 
23
- #define INTERN_TOKEN(parser, tok) \
24
- rbs_constant_pool_insert_shared_with_encoding( \
25
- &parser->constant_pool, \
26
- (const uint8_t *) rbs_peek_token(parser->rbs_lexer_t, tok), \
27
- rbs_token_bytes(tok), \
28
- (void *) parser->rbs_lexer_t->encoding \
23
+ #define INTERN_TOKEN(parser, tok) \
24
+ rbs_constant_pool_insert_shared_with_encoding( \
25
+ &parser->constant_pool, \
26
+ (const uint8_t *) rbs_peek_token(parser->lexer, tok), \
27
+ rbs_token_bytes(tok), \
28
+ parser->lexer->encoding \
29
29
  )
30
30
 
31
31
  #define KEYWORD_CASES \
@@ -128,7 +128,7 @@ static bool parse_simple(rbs_parser_t *parser, rbs_node_t **type);
128
128
  static rbs_string_t rbs_parser_peek_current_token(rbs_parser_t *parser) {
129
129
  rbs_range_t rg = parser->current_token.range;
130
130
 
131
- const char *start = parser->rbs_lexer_t->string.start + rg.start.byte_pos;
131
+ const char *start = parser->lexer->string.start + rg.start.byte_pos;
132
132
  size_t length = rg.end.byte_pos - rg.start.byte_pos;
133
133
 
134
134
  return rbs_string_new(start, start + length);
@@ -189,7 +189,7 @@ static bool parse_type_name(rbs_parser_t *parser, TypeNameKind kind, rbs_range_t
189
189
  .end = parser->current_token.range.end
190
190
  };
191
191
  rbs_location_t *loc = rbs_location_new(ALLOCATOR(), namespace_range);
192
- rbs_namespace_t *namespace = rbs_namespace_new(ALLOCATOR(), loc, path, absolute);
192
+ rbs_namespace_t *ns = rbs_namespace_new(ALLOCATOR(), loc, path, absolute);
193
193
 
194
194
  switch (parser->current_token.type) {
195
195
  case tLIDENT:
@@ -213,7 +213,7 @@ success: {
213
213
  rbs_location_t *symbolLoc = rbs_location_current_token(parser);
214
214
  rbs_constant_id_t name = INTERN_TOKEN(parser, parser->current_token);
215
215
  rbs_ast_symbol_t *symbol = rbs_ast_symbol_new(ALLOCATOR(), symbolLoc, &parser->constant_pool, name);
216
- *type_name = rbs_type_name_new(ALLOCATOR(), rbs_location_new(ALLOCATOR(), *rg), namespace, symbol);
216
+ *type_name = rbs_type_name_new(ALLOCATOR(), rbs_location_new(ALLOCATOR(), *rg), ns, symbol);
217
217
  return true;
218
218
  }
219
219
 
@@ -317,7 +317,7 @@ static bool parse_function_param(rbs_parser_t *parser, rbs_types_function_param_
317
317
  return false;
318
318
  }
319
319
 
320
- rbs_string_t unquoted_str = rbs_unquote_string(ALLOCATOR(), rbs_parser_peek_current_token(parser));
320
+ rbs_string_t unquoted_str = rbs_unquote_string(ALLOCATOR(), rbs_parser_peek_current_token(parser), parser->lexer->encoding);
321
321
  rbs_location_t *symbolLoc = rbs_location_current_token(parser);
322
322
  rbs_constant_id_t constant_id = rbs_constant_pool_insert_string(&parser->constant_pool, unquoted_str);
323
323
  rbs_ast_symbol_t *name = rbs_ast_symbol_new(ALLOCATOR(), symbolLoc, &parser->constant_pool, constant_id);
@@ -334,9 +334,9 @@ static bool parse_function_param(rbs_parser_t *parser, rbs_types_function_param_
334
334
  static rbs_constant_id_t intern_token_start_end(rbs_parser_t *parser, rbs_token_t start_token, rbs_token_t end_token) {
335
335
  return rbs_constant_pool_insert_shared_with_encoding(
336
336
  &parser->constant_pool,
337
- (const uint8_t *) rbs_peek_token(parser->rbs_lexer_t, start_token),
337
+ (const uint8_t *) rbs_peek_token(parser->lexer, start_token),
338
338
  end_token.range.end.byte_pos - start_token.range.start.byte_pos,
339
- parser->rbs_lexer_t->encoding
339
+ parser->lexer->encoding
340
340
  );
341
341
  }
342
342
 
@@ -902,7 +902,7 @@ static bool parse_record_attributes(rbs_parser_t *parser, rbs_hash_t **fields) {
902
902
  */
903
903
  NODISCARD
904
904
  static bool parse_symbol(rbs_parser_t *parser, rbs_location_t *location, rbs_types_literal_t **symbol) {
905
- size_t offset_bytes = parser->rbs_lexer_t->encoding->char_width((const uint8_t *) ":", (size_t) 1);
905
+ size_t offset_bytes = parser->lexer->encoding->char_width((const uint8_t *) ":", (size_t) 1);
906
906
  size_t bytes = rbs_token_bytes(parser->current_token) - offset_bytes;
907
907
 
908
908
  rbs_ast_symbol_t *literal;
@@ -911,7 +911,7 @@ static bool parse_symbol(rbs_parser_t *parser, rbs_location_t *location, rbs_typ
911
911
  case tSYMBOL: {
912
912
  rbs_location_t *symbolLoc = rbs_location_current_token(parser);
913
913
 
914
- char *buffer = rbs_peek_token(parser->rbs_lexer_t, parser->current_token);
914
+ char *buffer = rbs_peek_token(parser->lexer, parser->current_token);
915
915
  rbs_constant_id_t constant_id = rbs_constant_pool_insert_shared(
916
916
  &parser->constant_pool,
917
917
  (const uint8_t *) buffer + offset_bytes,
@@ -927,7 +927,7 @@ static bool parse_symbol(rbs_parser_t *parser, rbs_location_t *location, rbs_typ
927
927
 
928
928
  rbs_string_t symbol = rbs_string_new(current_token.start + offset_bytes, current_token.end);
929
929
 
930
- rbs_string_t unquoted_symbol = rbs_unquote_string(ALLOCATOR(), symbol);
930
+ rbs_string_t unquoted_symbol = rbs_unquote_string(ALLOCATOR(), symbol, parser->lexer->encoding);
931
931
 
932
932
  rbs_constant_id_t constant_id = rbs_constant_pool_insert_string(&parser->constant_pool, unquoted_symbol);
933
933
 
@@ -951,9 +951,9 @@ static bool parse_symbol(rbs_parser_t *parser, rbs_location_t *location, rbs_typ
951
951
  */
952
952
  NODISCARD
953
953
  static bool parse_instance_type(rbs_parser_t *parser, bool parse_alias, rbs_node_t **type) {
954
- TypeNameKind expected_kind = INTERFACE_NAME | CLASS_NAME;
954
+ TypeNameKind expected_kind = (TypeNameKind) (INTERFACE_NAME | CLASS_NAME);
955
955
  if (parse_alias) {
956
- expected_kind |= ALIAS_NAME;
956
+ expected_kind = (TypeNameKind) (expected_kind | ALIAS_NAME);
957
957
  }
958
958
 
959
959
  rbs_range_t name_range;
@@ -1157,7 +1157,7 @@ static bool parse_simple(rbs_parser_t *parser, rbs_node_t **type) {
1157
1157
  case tDQSTRING: {
1158
1158
  rbs_location_t *loc = rbs_location_current_token(parser);
1159
1159
 
1160
- rbs_string_t unquoted_str = rbs_unquote_string(ALLOCATOR(), rbs_parser_peek_current_token(parser));
1160
+ rbs_string_t unquoted_str = rbs_unquote_string(ALLOCATOR(), rbs_parser_peek_current_token(parser), parser->lexer->encoding);
1161
1161
  rbs_node_t *literal = (rbs_node_t *) rbs_ast_string_new(ALLOCATOR(), loc, unquoted_str);
1162
1162
  *type = (rbs_node_t *) rbs_types_literal_new(ALLOCATOR(), loc, literal);
1163
1163
  return true;
@@ -1172,7 +1172,7 @@ static bool parse_simple(rbs_parser_t *parser, rbs_node_t **type) {
1172
1172
  return true;
1173
1173
  }
1174
1174
  case tUIDENT: {
1175
- const char *name_str = rbs_peek_token(parser->rbs_lexer_t, parser->current_token);
1175
+ const char *name_str = rbs_peek_token(parser->lexer, parser->current_token);
1176
1176
  size_t name_len = rbs_token_bytes(parser->current_token);
1177
1177
 
1178
1178
  rbs_constant_id_t name = rbs_constant_pool_find(&parser->constant_pool, (const uint8_t *) name_str, name_len);
@@ -1452,7 +1452,7 @@ static bool parser_pop_typevar_table(rbs_parser_t *parser) {
1452
1452
  method_type ::= {} type_params <function>
1453
1453
  */
1454
1454
  // TODO: Should this be NODISCARD?
1455
- bool rbs_parse_method_type(rbs_parser_t *parser, rbs_method_type_t **method_type) {
1455
+ bool rbs_parse_method_type(rbs_parser_t *parser, rbs_method_type_t **method_type, bool require_eof) {
1456
1456
  rbs_parser_push_typevar_table(parser, false);
1457
1457
 
1458
1458
  rbs_range_t rg;
@@ -1468,10 +1468,18 @@ bool rbs_parse_method_type(rbs_parser_t *parser, rbs_method_type_t **method_type
1468
1468
  parse_function_result *result = rbs_allocator_alloc(ALLOCATOR(), parse_function_result);
1469
1469
  CHECK_PARSE(parse_function(parser, false, &result));
1470
1470
 
1471
+ CHECK_PARSE(parser_pop_typevar_table(parser));
1472
+
1471
1473
  rg.end = parser->current_token.range.end;
1472
1474
  type_range.end = rg.end;
1473
1475
 
1474
- CHECK_PARSE(parser_pop_typevar_table(parser));
1476
+ if (require_eof) {
1477
+ rbs_parser_advance(parser);
1478
+ if (parser->current_token.type != pEOF) {
1479
+ rbs_parser_set_error(parser, parser->current_token, true, "expected a token `%s`", rbs_token_type_str(pEOF));
1480
+ return false;
1481
+ }
1482
+ }
1475
1483
 
1476
1484
  rbs_location_t *loc = rbs_location_new(ALLOCATOR(), rg);
1477
1485
  rbs_loc_alloc_children(ALLOCATOR(), loc, 2);
@@ -1598,14 +1606,16 @@ static bool parse_annotation(rbs_parser_t *parser, rbs_ast_annotation_t **annota
1598
1606
  rbs_range_t rg = parser->current_token.range;
1599
1607
 
1600
1608
  size_t offset_bytes =
1601
- parser->rbs_lexer_t->encoding->char_width((const uint8_t *) "%", (size_t) 1) +
1602
- parser->rbs_lexer_t->encoding->char_width((const uint8_t *) "a", (size_t) 1);
1609
+ parser->lexer->encoding->char_width((const uint8_t *) "%", (size_t) 1) +
1610
+ parser->lexer->encoding->char_width((const uint8_t *) "a", (size_t) 1);
1603
1611
 
1604
1612
  rbs_string_t str = rbs_string_new(
1605
- parser->rbs_lexer_t->string.start + rg.start.byte_pos + offset_bytes,
1606
- parser->rbs_lexer_t->string.end
1613
+ parser->lexer->string.start + rg.start.byte_pos + offset_bytes,
1614
+ parser->lexer->string.end
1607
1615
  );
1608
- unsigned int open_char = rbs_utf8_string_to_codepoint(str);
1616
+
1617
+ // Assumes the input is ASCII compatible
1618
+ unsigned int open_char = str.start[0];
1609
1619
 
1610
1620
  unsigned int close_char;
1611
1621
 
@@ -1630,8 +1640,8 @@ static bool parse_annotation(rbs_parser_t *parser, rbs_ast_annotation_t **annota
1630
1640
  return false;
1631
1641
  }
1632
1642
 
1633
- size_t open_bytes = parser->rbs_lexer_t->encoding->char_width((const uint8_t *) &open_char, (size_t) 1);
1634
- size_t close_bytes = parser->rbs_lexer_t->encoding->char_width((const uint8_t *) &close_char, (size_t) 1);
1643
+ size_t open_bytes = parser->lexer->encoding->char_width((const uint8_t *) &open_char, (size_t) 1);
1644
+ size_t close_bytes = parser->lexer->encoding->char_width((const uint8_t *) &close_char, (size_t) 1);
1635
1645
 
1636
1646
  rbs_string_t current_token = rbs_parser_peek_current_token(parser);
1637
1647
  size_t total_offset = offset_bytes + open_bytes;
@@ -1695,9 +1705,9 @@ static bool parse_method_name(rbs_parser_t *parser, rbs_range_t *range, rbs_ast_
1695
1705
 
1696
1706
  rbs_constant_id_t constant_id = rbs_constant_pool_insert_shared_with_encoding(
1697
1707
  &parser->constant_pool,
1698
- (const uint8_t *) parser->rbs_lexer_t->string.start + range->start.byte_pos,
1708
+ (const uint8_t *) parser->lexer->string.start + range->start.byte_pos,
1699
1709
  range->end.byte_pos - range->start.byte_pos,
1700
- parser->rbs_lexer_t->encoding
1710
+ parser->lexer->encoding
1701
1711
  );
1702
1712
 
1703
1713
  rbs_location_t *symbolLoc = rbs_location_new(ALLOCATOR(), *range);
@@ -1718,7 +1728,7 @@ static bool parse_method_name(rbs_parser_t *parser, rbs_range_t *range, rbs_ast_
1718
1728
  }
1719
1729
  case tQIDENT: {
1720
1730
  rbs_string_t string = rbs_parser_peek_current_token(parser);
1721
- rbs_string_t unquoted_str = rbs_unquote_string(ALLOCATOR(), string);
1731
+ rbs_string_t unquoted_str = rbs_unquote_string(ALLOCATOR(), string, parser->lexer->encoding);
1722
1732
  rbs_constant_id_t constant_id = rbs_constant_pool_insert_string(&parser->constant_pool, unquoted_str);
1723
1733
  rbs_location_t *symbolLoc = rbs_location_current_token(parser);
1724
1734
  *symbol = rbs_ast_symbol_new(ALLOCATOR(), symbolLoc, &parser->constant_pool, constant_id);
@@ -1879,7 +1889,7 @@ static bool parse_member_def(rbs_parser_t *parser, bool instance_only, bool acce
1879
1889
  case pLBRACKET:
1880
1890
  case pQUESTION: {
1881
1891
  rbs_method_type_t *method_type = NULL;
1882
- CHECK_PARSE(rbs_parse_method_type(parser, &method_type));
1892
+ CHECK_PARSE(rbs_parse_method_type(parser, &method_type, false));
1883
1893
 
1884
1894
  overload_range.end = parser->current_token.range.end;
1885
1895
  rbs_location_t *loc = rbs_location_new(ALLOCATOR(), overload_range);
@@ -2021,7 +2031,7 @@ static bool parse_mixin_member(rbs_parser_t *parser, bool from_interface, rbs_po
2021
2031
  rbs_type_name_t *name = NULL;
2022
2032
  CHECK_PARSE(class_instance_name(
2023
2033
  parser,
2024
- from_interface ? INTERFACE_NAME : (INTERFACE_NAME | CLASS_NAME),
2034
+ from_interface ? INTERFACE_NAME : (TypeNameKind) (INTERFACE_NAME | CLASS_NAME),
2025
2035
  args,
2026
2036
  &name_range,
2027
2037
  &args_range,
@@ -2486,7 +2496,7 @@ static bool parse_module_self_types(rbs_parser_t *parser, rbs_node_list_t *array
2486
2496
 
2487
2497
  rbs_range_t name_range;
2488
2498
  rbs_type_name_t *module_name = NULL;
2489
- CHECK_PARSE(parse_type_name(parser, CLASS_NAME | INTERFACE_NAME, &name_range, &module_name));
2499
+ CHECK_PARSE(parse_type_name(parser, (TypeNameKind) (CLASS_NAME | INTERFACE_NAME), &name_range, &module_name));
2490
2500
  self_range.end = name_range.end;
2491
2501
 
2492
2502
  rbs_node_list_t *args = rbs_node_list_new(ALLOCATOR());
@@ -2949,7 +2959,7 @@ static bool parse_decl(rbs_parser_t *parser, rbs_node_t **decl) {
2949
2959
  | {} <> (empty -- returns empty namespace)
2950
2960
  */
2951
2961
  NODISCARD
2952
- static bool parse_namespace(rbs_parser_t *parser, rbs_range_t *rg, rbs_namespace_t **namespace) {
2962
+ static bool parse_namespace(rbs_parser_t *parser, rbs_range_t *rg, rbs_namespace_t **out_ns) {
2953
2963
  bool is_absolute = false;
2954
2964
 
2955
2965
  if (parser->next_token.type == pCOLON2) {
@@ -2980,7 +2990,7 @@ static bool parse_namespace(rbs_parser_t *parser, rbs_range_t *rg, rbs_namespace
2980
2990
  }
2981
2991
  }
2982
2992
 
2983
- *namespace = rbs_namespace_new(ALLOCATOR(), rbs_location_new(ALLOCATOR(), *rg), path, is_absolute);
2993
+ *out_ns = rbs_namespace_new(ALLOCATOR(), rbs_location_new(ALLOCATOR(), *rg), path, is_absolute);
2984
2994
  return true;
2985
2995
  }
2986
2996
 
@@ -2995,8 +3005,8 @@ NODISCARD
2995
3005
  static bool parse_use_clauses(rbs_parser_t *parser, rbs_node_list_t *clauses) {
2996
3006
  while (true) {
2997
3007
  rbs_range_t namespace_range = NULL_RANGE;
2998
- rbs_namespace_t *namespace = NULL;
2999
- CHECK_PARSE(parse_namespace(parser, &namespace_range, &namespace));
3008
+ rbs_namespace_t *ns = NULL;
3009
+ CHECK_PARSE(parse_namespace(parser, &namespace_range, &ns));
3000
3010
 
3001
3011
  switch (parser->next_token.type) {
3002
3012
  case tLIDENT:
@@ -3010,7 +3020,7 @@ static bool parse_use_clauses(rbs_parser_t *parser, rbs_node_list_t *clauses) {
3010
3020
 
3011
3021
  rbs_location_t *symbolLoc = rbs_location_current_token(parser);
3012
3022
  rbs_ast_symbol_t *symbol = rbs_ast_symbol_new(ALLOCATOR(), symbolLoc, &parser->constant_pool, INTERN_TOKEN(parser, parser->current_token));
3013
- rbs_type_name_t *type_name = rbs_type_name_new(ALLOCATOR(), rbs_location_new(ALLOCATOR(), type_name_range), namespace, symbol);
3023
+ rbs_type_name_t *type_name = rbs_type_name_new(ALLOCATOR(), rbs_location_new(ALLOCATOR(), type_name_range), ns, symbol);
3014
3024
 
3015
3025
  rbs_range_t keyword_range = NULL_RANGE;
3016
3026
  rbs_range_t new_name_range = NULL_RANGE;
@@ -3053,7 +3063,7 @@ static bool parse_use_clauses(rbs_parser_t *parser, rbs_node_list_t *clauses) {
3053
3063
  rbs_loc_add_required_child(loc, INTERN("namespace"), namespace_range);
3054
3064
  rbs_loc_add_required_child(loc, INTERN("star"), star_range);
3055
3065
 
3056
- rbs_ast_directives_use_wildcard_clause_t *clause = rbs_ast_directives_use_wildcard_clause_new(ALLOCATOR(), loc, namespace);
3066
+ rbs_ast_directives_use_wildcard_clause_t *clause = rbs_ast_directives_use_wildcard_clause_new(ALLOCATOR(), loc, ns);
3057
3067
  rbs_node_list_append(clauses, (rbs_node_t *) clause);
3058
3068
 
3059
3069
  break;
@@ -3100,8 +3110,8 @@ static bool parse_use_directive(rbs_parser_t *parser, rbs_ast_directives_use_t *
3100
3110
  }
3101
3111
 
3102
3112
  static rbs_ast_comment_t *parse_comment_lines(rbs_parser_t *parser, rbs_comment_t *com) {
3103
- size_t hash_bytes = parser->rbs_lexer_t->encoding->char_width((const uint8_t *) "#", (size_t) 1);
3104
- size_t space_bytes = parser->rbs_lexer_t->encoding->char_width((const uint8_t *) " ", (size_t) 1);
3113
+ size_t hash_bytes = parser->lexer->encoding->char_width((const uint8_t *) "#", (size_t) 1);
3114
+ size_t space_bytes = parser->lexer->encoding->char_width((const uint8_t *) " ", (size_t) 1);
3105
3115
 
3106
3116
  rbs_buffer_t rbs_buffer;
3107
3117
  rbs_buffer_init(ALLOCATOR(), &rbs_buffer);
@@ -3109,14 +3119,16 @@ static rbs_ast_comment_t *parse_comment_lines(rbs_parser_t *parser, rbs_comment_
3109
3119
  for (size_t i = 0; i < com->line_tokens_count; i++) {
3110
3120
  rbs_token_t tok = com->line_tokens[i];
3111
3121
 
3112
- const char *comment_start = parser->rbs_lexer_t->string.start + tok.range.start.byte_pos + hash_bytes;
3122
+ const char *comment_start = parser->lexer->string.start + tok.range.start.byte_pos + hash_bytes;
3113
3123
  size_t comment_bytes = RBS_RANGE_BYTES(tok.range) - hash_bytes;
3114
3124
 
3115
3125
  rbs_string_t str = rbs_string_new(
3116
3126
  comment_start,
3117
- parser->rbs_lexer_t->string.end
3127
+ parser->lexer->string.end
3118
3128
  );
3119
- unsigned char c = rbs_utf8_string_to_codepoint(str);
3129
+
3130
+ // Assumes the input is ASCII compatible
3131
+ unsigned char c = str.start[0];
3120
3132
 
3121
3133
  if (c == ' ') {
3122
3134
  comment_start += space_bytes;
@@ -3332,7 +3344,7 @@ void rbs_parser_advance(rbs_parser_t *parser) {
3332
3344
  break;
3333
3345
  }
3334
3346
 
3335
- parser->next_token3 = rbs_lexer_next_token(parser->rbs_lexer_t);
3347
+ parser->next_token3 = rbs_lexer_next_token(parser->lexer);
3336
3348
 
3337
3349
  if (parser->next_token3.type == tCOMMENT) {
3338
3350
  // skip
@@ -3424,7 +3436,7 @@ rbs_parser_t *rbs_parser_new(rbs_string_t string, const rbs_encoding_t *encoding
3424
3436
  rbs_parser_t *parser = rbs_allocator_alloc(allocator, rbs_parser_t);
3425
3437
 
3426
3438
  *parser = (rbs_parser_t) {
3427
- .rbs_lexer_t = lexer,
3439
+ .lexer = lexer,
3428
3440
 
3429
3441
  .current_token = NullToken,
3430
3442
  .next_token = NullToken,
data/src/string.c CHANGED
@@ -1,59 +1,10 @@
1
1
  #include "rbs/string.h"
2
- #include "rbs/defines.h"
3
2
 
4
3
  #include <stdlib.h>
5
4
  #include <string.h>
6
5
  #include <stdio.h>
7
6
  #include <ctype.h>
8
7
 
9
- unsigned int rbs_utf8_string_to_codepoint(const rbs_string_t string) {
10
- unsigned int codepoint = 0;
11
- int remaining_bytes = 0;
12
-
13
- const char *s = string.start;
14
- const char *end = string.end;
15
-
16
- if (s >= end) return 0; // End of string
17
-
18
- if (RBS_LIKELY((*s & 0x80) == 0)) {
19
- // Single byte character (0xxxxxxx)
20
- return *s;
21
- } else if ((*s & 0xE0) == 0xC0) {
22
- // Two byte character (110xxxxx 10xxxxxx)
23
- codepoint = *s & 0x1F;
24
- remaining_bytes = 1;
25
- } else if ((*s & 0xF0) == 0xE0) {
26
- // Three byte character (1110xxxx 10xxxxxx 10xxxxxx)
27
- codepoint = *s & 0x0F;
28
- remaining_bytes = 2;
29
- } else if ((*s & 0xF8) == 0xF0) {
30
- // Four byte character (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
31
- codepoint = *s & 0x07;
32
- remaining_bytes = 3;
33
- } else {
34
- // Invalid UTF-8 sequence
35
- return 0xFFFD; // Unicode replacement character
36
- }
37
-
38
- s++;
39
- while (remaining_bytes > 0 && s < end) {
40
- if ((*s & 0xC0) != 0x80) {
41
- // Invalid continuation byte
42
- return 0xFFFD;
43
- }
44
- codepoint = (codepoint << 6) | (*s & 0x3F);
45
- s++;
46
- remaining_bytes--;
47
- }
48
-
49
- if (remaining_bytes > 0) {
50
- // Incomplete sequence
51
- return 0xFFFD;
52
- }
53
-
54
- return codepoint;
55
- }
56
-
57
8
  rbs_string_t rbs_string_new(const char *start, const char *end) {
58
9
  return (rbs_string_t) {
59
10
  .start = start,
@@ -57,7 +57,7 @@ static size_t get_system_page_size(void) {
57
57
  static rbs_allocator_page_t *rbs_allocator_page_new(size_t payload_size) {
58
58
  const size_t page_header_size = sizeof(rbs_allocator_page_t);
59
59
 
60
- rbs_allocator_page_t *page = malloc(page_header_size + payload_size);
60
+ rbs_allocator_page_t *page = (rbs_allocator_page_t *) malloc(page_header_size + payload_size);
61
61
  page->size = payload_size;
62
62
  page->used = 0;
63
63
 
@@ -65,7 +65,7 @@ static rbs_allocator_page_t *rbs_allocator_page_new(size_t payload_size) {
65
65
  }
66
66
 
67
67
  rbs_allocator_t *rbs_allocator_init(void) {
68
- rbs_allocator_t *allocator = malloc(sizeof(rbs_allocator_t));
68
+ rbs_allocator_t *allocator = (rbs_allocator_t *) malloc(sizeof(rbs_allocator_t));
69
69
 
70
70
  const size_t system_page_size = get_system_page_size();
71
71
 
@@ -6,8 +6,6 @@
6
6
  #include <stdbool.h>
7
7
 
8
8
  void rbs_assert_impl(bool condition, const char *fmt, ...) {
9
- printf("RBS_ASSERT called\n");
10
-
11
9
  if (condition) {
12
10
  return;
13
11
  }
@@ -57,8 +57,8 @@ rbs_constant_pool_resize(rbs_constant_pool_t *pool) {
57
57
  void *next = calloc(next_capacity, element_size);
58
58
  if (next == NULL) return false;
59
59
 
60
- rbs_constant_pool_bucket_t *next_buckets = next;
61
- rbs_constant_t *next_constants = (void *) (((char *) next) + next_capacity * sizeof(rbs_constant_pool_bucket_t));
60
+ rbs_constant_pool_bucket_t *next_buckets = (rbs_constant_pool_bucket_t *) next;
61
+ rbs_constant_t *next_constants = (rbs_constant_t *) (((char *) next) + next_capacity * sizeof(rbs_constant_pool_bucket_t));
62
62
 
63
63
  // For each bucket in the current constant pool, find the index in the
64
64
  // next constant pool, and insert it.
@@ -111,8 +111,8 @@ bool rbs_constant_pool_init(rbs_constant_pool_t *pool, uint32_t capacity) {
111
111
  void *memory = calloc(capacity, element_size);
112
112
  if (memory == NULL) return false;
113
113
 
114
- pool->buckets = memory;
115
- pool->constants = (void *) (((char *) memory) + capacity * sizeof(rbs_constant_pool_bucket_t));
114
+ pool->buckets = (rbs_constant_pool_bucket_t *) memory;
115
+ pool->constants = (rbs_constant_t *) (((char *) memory) + capacity * sizeof(rbs_constant_pool_bucket_t));
116
116
  pool->size = 0;
117
117
  pool->capacity = capacity;
118
118
  return true;
@@ -1,4 +1,5 @@
1
1
  #include "rbs/util/rbs_unescape.h"
2
+ #include "rbs/util/rbs_encoding.h"
2
3
  #include <string.h>
3
4
  #include <stdlib.h>
4
5
  #include <ctype.h>
@@ -42,20 +43,44 @@ static int octal_to_int(const char *octal, int length) {
42
43
  return result;
43
44
  }
44
45
 
45
- int rbs_utf8_codelen(unsigned int c) {
46
- if (c <= 0x7F) return 1;
47
- if (c <= 0x7FF) return 2;
48
- if (c <= 0xFFFF) return 3;
49
- if (c <= 0x10FFFF) return 4;
50
- return 1; // Invalid Unicode codepoint, treat as 1 byte
46
+ // Fills buf starting at index 'start' with the UTF-8 encoding of 'codepoint'.
47
+ // Returns the number of bytes written, or 0 when the output is not changed.
48
+ //
49
+ size_t rbs_utf8_fill_codepoint(char *buf, size_t start, size_t end, unsigned int codepoint) {
50
+ if (start + 4 > end) {
51
+ return 0;
52
+ }
53
+
54
+ if (codepoint <= 0x7F) {
55
+ buf[start] = codepoint & 0x7F;
56
+ return 1;
57
+ } else if (codepoint <= 0x7FF) {
58
+ buf[start + 0] = 0xC0 | ((codepoint >> 6) & 0x1F);
59
+ buf[start + 1] = 0x80 | (codepoint & 0x3F);
60
+ return 2;
61
+ } else if (codepoint <= 0xFFFF) {
62
+ buf[start + 0] = 0xE0 | ((codepoint >> 12) & 0x0F);
63
+ buf[start + 1] = 0x80 | ((codepoint >> 6) & 0x3F);
64
+ buf[start + 2] = 0x80 | (codepoint & 0x3F);
65
+ return 3;
66
+ } else if (codepoint <= 0x10FFFF) {
67
+ buf[start + 0] = 0xF0 | ((codepoint >> 18) & 0x07);
68
+ buf[start + 1] = 0x80 | ((codepoint >> 12) & 0x3F);
69
+ buf[start + 2] = 0x80 | ((codepoint >> 6) & 0x3F);
70
+ buf[start + 3] = 0x80 | (codepoint & 0x3F);
71
+ return 4;
72
+ } else {
73
+ return 0;
74
+ }
51
75
  }
52
76
 
53
- rbs_string_t unescape_string(rbs_allocator_t *allocator, const rbs_string_t string, bool is_double_quote) {
77
+ rbs_string_t unescape_string(rbs_allocator_t *allocator, const rbs_string_t string, bool is_double_quote, bool is_unicode) {
54
78
  if (!string.start) return RBS_STRING_NULL;
55
79
 
56
80
  size_t len = string.end - string.start;
57
81
  const char *input = string.start;
58
82
 
83
+ // The output cannot be longer than the input even after unescaping.
59
84
  char *output = rbs_allocator_alloc_many(allocator, len + 1, char);
60
85
  if (!output) return RBS_STRING_NULL;
61
86
 
@@ -79,9 +104,21 @@ rbs_string_t unescape_string(rbs_allocator_t *allocator, const rbs_string_t stri
79
104
  i += hex_len + 2;
80
105
  } else if (input[i + 1] == 'u' && i + 5 < len) {
81
106
  // Unicode escape
82
- int value = hex_to_int(input + i + 2, 4);
83
- output[j++] = (char) value;
84
- i += 6;
107
+
108
+ if (is_unicode) {
109
+ // The UTF-8 representation is at most 4 bytes, shorter than the input length.
110
+ int value = hex_to_int(input + i + 2, 4);
111
+ j += rbs_utf8_fill_codepoint(output, j, len + 1, value);
112
+ i += 6;
113
+ } else {
114
+ // Copy the escape sequence as-is
115
+ output[j++] = input[i++];
116
+ output[j++] = input[i++];
117
+ output[j++] = input[i++];
118
+ output[j++] = input[i++];
119
+ output[j++] = input[i++];
120
+ output[j++] = input[i++];
121
+ }
85
122
  } else {
86
123
  // Other escapes
87
124
  int found = 0;
@@ -114,18 +151,17 @@ rbs_string_t unescape_string(rbs_allocator_t *allocator, const rbs_string_t stri
114
151
  return rbs_string_new(output, output + j);
115
152
  }
116
153
 
117
- rbs_string_t rbs_unquote_string(rbs_allocator_t *allocator, rbs_string_t input) {
118
- unsigned int first_char = rbs_utf8_string_to_codepoint(input);
119
- size_t byte_length = rbs_string_len(input);
154
+ rbs_string_t rbs_unquote_string(rbs_allocator_t *allocator, rbs_string_t input, const rbs_encoding_t *encoding) {
155
+ unsigned int first_char = input.start[0];
156
+
157
+ const char *new_start = input.start;
158
+ const char *new_end = input.end;
120
159
 
121
- ptrdiff_t start_offset = 0;
122
160
  if (first_char == '"' || first_char == '\'' || first_char == '`') {
123
- int bs = rbs_utf8_codelen(first_char);
124
- start_offset += bs;
125
- byte_length -= 2 * bs;
161
+ new_start += 1;
162
+ new_end -= 1;
126
163
  }
127
164
 
128
- const char *new_start = input.start + start_offset;
129
- rbs_string_t string = rbs_string_new(new_start, new_start + byte_length);
130
- return unescape_string(allocator, string, first_char == '"');
165
+ rbs_string_t string = rbs_string_new(new_start, new_end);
166
+ return unescape_string(allocator, string, first_char == '"', encoding == RBS_ENCODING_UTF_8_ENTRY);
131
167
  }
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbs
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.10.0.pre.1
4
+ version: 3.10.0.pre.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Soutaro Matsumoto
@@ -139,6 +139,7 @@ files:
139
139
  - docs/architecture.md
140
140
  - docs/collection.md
141
141
  - docs/data_and_struct.md
142
+ - docs/encoding.md
142
143
  - docs/gem.md
143
144
  - docs/rbs_by_example.md
144
145
  - docs/repo.md
@@ -560,7 +561,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
560
561
  - !ruby/object:Gem::Version
561
562
  version: '0'
562
563
  requirements: []
563
- rubygems_version: 4.0.0.dev
564
+ rubygems_version: 4.0.1
564
565
  specification_version: 4
565
566
  summary: Type signature for Ruby.
566
567
  test_files: []