jruby-prism-parser 0.24.0-java → 1.4.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148)
  1. checksums.yaml +4 -4
  2. data/BSDmakefile +58 -0
  3. data/CHANGELOG.md +269 -1
  4. data/CONTRIBUTING.md +0 -4
  5. data/Makefile +25 -18
  6. data/README.md +57 -6
  7. data/config.yml +1724 -140
  8. data/docs/build_system.md +39 -11
  9. data/docs/configuration.md +4 -0
  10. data/docs/cruby_compilation.md +1 -1
  11. data/docs/fuzzing.md +1 -1
  12. data/docs/parser_translation.md +14 -9
  13. data/docs/parsing_rules.md +4 -1
  14. data/docs/releasing.md +8 -10
  15. data/docs/relocation.md +34 -0
  16. data/docs/ripper_translation.md +72 -0
  17. data/docs/ruby_api.md +2 -1
  18. data/docs/serialization.md +29 -5
  19. data/ext/prism/api_node.c +3395 -1999
  20. data/ext/prism/api_pack.c +9 -0
  21. data/ext/prism/extconf.rb +55 -34
  22. data/ext/prism/extension.c +597 -346
  23. data/ext/prism/extension.h +6 -5
  24. data/include/prism/ast.h +2612 -455
  25. data/include/prism/defines.h +160 -2
  26. data/include/prism/diagnostic.h +188 -76
  27. data/include/prism/encoding.h +22 -4
  28. data/include/prism/node.h +89 -17
  29. data/include/prism/options.h +224 -12
  30. data/include/prism/pack.h +11 -0
  31. data/include/prism/parser.h +267 -66
  32. data/include/prism/prettyprint.h +8 -0
  33. data/include/prism/regexp.h +18 -8
  34. data/include/prism/static_literals.h +121 -0
  35. data/include/prism/util/pm_buffer.h +75 -2
  36. data/include/prism/util/pm_char.h +1 -2
  37. data/include/prism/util/pm_constant_pool.h +18 -9
  38. data/include/prism/util/pm_integer.h +126 -0
  39. data/include/prism/util/pm_list.h +1 -1
  40. data/include/prism/util/pm_newline_list.h +19 -0
  41. data/include/prism/util/pm_string.h +48 -8
  42. data/include/prism/version.h +3 -3
  43. data/include/prism.h +99 -5
  44. data/jruby-prism.jar +0 -0
  45. data/lib/prism/compiler.rb +11 -1
  46. data/lib/prism/desugar_compiler.rb +113 -74
  47. data/lib/prism/dispatcher.rb +45 -1
  48. data/lib/prism/dot_visitor.rb +201 -77
  49. data/lib/prism/dsl.rb +673 -461
  50. data/lib/prism/ffi.rb +233 -45
  51. data/lib/prism/inspect_visitor.rb +2389 -0
  52. data/lib/prism/lex_compat.rb +35 -16
  53. data/lib/prism/mutation_compiler.rb +24 -8
  54. data/lib/prism/node.rb +7731 -8460
  55. data/lib/prism/node_ext.rb +328 -32
  56. data/lib/prism/pack.rb +4 -0
  57. data/lib/prism/parse_result/comments.rb +34 -24
  58. data/lib/prism/parse_result/errors.rb +65 -0
  59. data/lib/prism/parse_result/newlines.rb +102 -12
  60. data/lib/prism/parse_result.rb +448 -44
  61. data/lib/prism/pattern.rb +28 -10
  62. data/lib/prism/polyfill/append_as_bytes.rb +15 -0
  63. data/lib/prism/polyfill/byteindex.rb +13 -0
  64. data/lib/prism/polyfill/unpack1.rb +14 -0
  65. data/lib/prism/reflection.rb +413 -0
  66. data/lib/prism/relocation.rb +504 -0
  67. data/lib/prism/serialize.rb +1940 -1198
  68. data/lib/prism/string_query.rb +30 -0
  69. data/lib/prism/translation/parser/builder.rb +61 -0
  70. data/lib/prism/translation/parser/compiler.rb +569 -195
  71. data/lib/prism/translation/parser/lexer.rb +516 -39
  72. data/lib/prism/translation/parser.rb +177 -12
  73. data/lib/prism/translation/parser33.rb +1 -1
  74. data/lib/prism/translation/parser34.rb +1 -1
  75. data/lib/prism/translation/parser35.rb +12 -0
  76. data/lib/prism/translation/ripper/sexp.rb +125 -0
  77. data/lib/prism/translation/ripper/shim.rb +5 -0
  78. data/lib/prism/translation/ripper.rb +3224 -462
  79. data/lib/prism/translation/ruby_parser.rb +194 -69
  80. data/lib/prism/translation.rb +4 -1
  81. data/lib/prism/version.rb +1 -1
  82. data/lib/prism/visitor.rb +13 -0
  83. data/lib/prism.rb +17 -27
  84. data/prism.gemspec +57 -17
  85. data/rbi/prism/compiler.rbi +12 -0
  86. data/rbi/prism/dsl.rbi +524 -0
  87. data/rbi/prism/inspect_visitor.rbi +12 -0
  88. data/rbi/prism/node.rbi +8722 -0
  89. data/rbi/prism/node_ext.rbi +107 -0
  90. data/rbi/prism/parse_result.rbi +404 -0
  91. data/rbi/prism/reflection.rbi +58 -0
  92. data/rbi/prism/string_query.rbi +12 -0
  93. data/rbi/prism/translation/parser.rbi +11 -0
  94. data/rbi/prism/translation/parser33.rbi +6 -0
  95. data/rbi/prism/translation/parser34.rbi +6 -0
  96. data/rbi/prism/translation/parser35.rbi +6 -0
  97. data/rbi/prism/translation/ripper.rbi +15 -0
  98. data/rbi/prism/visitor.rbi +473 -0
  99. data/rbi/prism.rbi +44 -7745
  100. data/sig/prism/compiler.rbs +9 -0
  101. data/sig/prism/dispatcher.rbs +16 -0
  102. data/sig/prism/dot_visitor.rbs +6 -0
  103. data/sig/prism/dsl.rbs +351 -0
  104. data/sig/prism/inspect_visitor.rbs +22 -0
  105. data/sig/prism/lex_compat.rbs +10 -0
  106. data/sig/prism/mutation_compiler.rbs +159 -0
  107. data/sig/prism/node.rbs +3614 -0
  108. data/sig/prism/node_ext.rbs +82 -0
  109. data/sig/prism/pack.rbs +43 -0
  110. data/sig/prism/parse_result.rbs +192 -0
  111. data/sig/prism/pattern.rbs +13 -0
  112. data/sig/prism/reflection.rbs +50 -0
  113. data/sig/prism/relocation.rbs +185 -0
  114. data/sig/prism/serialize.rbs +8 -0
  115. data/sig/prism/string_query.rbs +11 -0
  116. data/sig/prism/visitor.rbs +169 -0
  117. data/sig/prism.rbs +248 -4767
  118. data/src/diagnostic.c +672 -230
  119. data/src/encoding.c +211 -108
  120. data/src/node.c +7541 -1653
  121. data/src/options.c +135 -20
  122. data/src/pack.c +33 -17
  123. data/src/prettyprint.c +1543 -1485
  124. data/src/prism.c +7813 -3050
  125. data/src/regexp.c +225 -73
  126. data/src/serialize.c +101 -77
  127. data/src/static_literals.c +617 -0
  128. data/src/token_type.c +14 -13
  129. data/src/util/pm_buffer.c +187 -20
  130. data/src/util/pm_char.c +5 -5
  131. data/src/util/pm_constant_pool.c +39 -19
  132. data/src/util/pm_integer.c +670 -0
  133. data/src/util/pm_list.c +1 -1
  134. data/src/util/pm_newline_list.c +43 -5
  135. data/src/util/pm_string.c +213 -33
  136. data/src/util/pm_strncasecmp.c +13 -1
  137. data/src/util/pm_strpbrk.c +32 -6
  138. metadata +55 -19
  139. data/docs/ripper.md +0 -36
  140. data/include/prism/util/pm_state_stack.h +0 -42
  141. data/include/prism/util/pm_string_list.h +0 -44
  142. data/lib/prism/debug.rb +0 -206
  143. data/lib/prism/node_inspector.rb +0 -68
  144. data/lib/prism/translation/parser/rubocop.rb +0 -45
  145. data/rbi/prism_static.rbi +0 -207
  146. data/sig/prism_static.rbs +0 -201
  147. data/src/util/pm_state_stack.c +0 -25
  148. data/src/util/pm_string_list.c +0 -28
@@ -6,7 +6,7 @@
6
6
  */
7
7
  bool
8
8
  pm_newline_list_init(pm_newline_list_t *list, const uint8_t *start, size_t capacity) {
9
- list->offsets = (size_t *) calloc(capacity, sizeof(size_t));
9
+ list->offsets = (size_t *) xcalloc(capacity, sizeof(size_t));
10
10
  if (list->offsets == NULL) return false;
11
11
 
12
12
  list->start = start;
@@ -19,6 +19,14 @@ pm_newline_list_init(pm_newline_list_t *list, const uint8_t *start, size_t capac
19
19
  return true;
20
20
  }
21
21
 
22
+ /**
23
+ * Clear out the newlines that have been appended to the list.
24
+ */
25
+ void
26
+ pm_newline_list_clear(pm_newline_list_t *list) {
27
+ list->size = 1;
28
+ }
29
+
22
30
  /**
23
31
  * Append a new offset to the newline list. Returns true if the reallocation of
24
32
  * the offsets succeeds (if one was necessary), otherwise returns false.
@@ -29,10 +37,11 @@ pm_newline_list_append(pm_newline_list_t *list, const uint8_t *cursor) {
29
37
  size_t *original_offsets = list->offsets;
30
38
 
31
39
  list->capacity = (list->capacity * 3) / 2;
32
- list->offsets = (size_t *) calloc(list->capacity, sizeof(size_t));
33
- memcpy(list->offsets, original_offsets, list->size * sizeof(size_t));
34
- free(original_offsets);
40
+ list->offsets = (size_t *) xcalloc(list->capacity, sizeof(size_t));
35
41
  if (list->offsets == NULL) return false;
42
+
43
+ memcpy(list->offsets, original_offsets, list->size * sizeof(size_t));
44
+ xfree(original_offsets);
36
45
  }
37
46
 
38
47
  assert(*cursor == '\n');
@@ -45,6 +54,35 @@ pm_newline_list_append(pm_newline_list_t *list, const uint8_t *cursor) {
45
54
  return true;
46
55
  }
47
56
 
57
+ /**
58
+ * Returns the line of the given offset. If the offset is not in the list, the
59
+ * line of the closest offset less than the given offset is returned.
60
+ */
61
+ int32_t
62
+ pm_newline_list_line(const pm_newline_list_t *list, const uint8_t *cursor, int32_t start_line) {
63
+ assert(cursor >= list->start);
64
+ size_t offset = (size_t) (cursor - list->start);
65
+
66
+ size_t left = 0;
67
+ size_t right = list->size - 1;
68
+
69
+ while (left <= right) {
70
+ size_t mid = left + (right - left) / 2;
71
+
72
+ if (list->offsets[mid] == offset) {
73
+ return ((int32_t) mid) + start_line;
74
+ }
75
+
76
+ if (list->offsets[mid] < offset) {
77
+ left = mid + 1;
78
+ } else {
79
+ right = mid - 1;
80
+ }
81
+ }
82
+
83
+ return ((int32_t) left) + start_line - 1;
84
+ }
85
+
48
86
  /**
49
87
  * Returns the line and column of the given offset. If the offset is not in the
50
88
  * list, the line and column of the closest offset less than the given offset
@@ -83,5 +121,5 @@ pm_newline_list_line_column(const pm_newline_list_t *list, const uint8_t *cursor
83
121
  */
84
122
  void
85
123
  pm_newline_list_free(pm_newline_list_t *list) {
86
- free(list->offsets);
124
+ xfree(list->offsets);
87
125
  }
data/src/util/pm_string.c CHANGED
@@ -47,6 +47,62 @@ pm_string_constant_init(pm_string_t *string, const char *source, size_t length)
47
47
  };
48
48
  }
49
49
 
50
+ #ifdef _WIN32
51
+ /**
52
+ * Represents a file handle on Windows, where the path will need to be freed
53
+ * when the file is closed.
54
+ */
55
+ typedef struct {
56
+ /** The path to the file, which will become allocated memory. */
57
+ WCHAR *path;
58
+
59
+ /** The handle to the file, which will start as uninitialized memory. */
60
+ HANDLE file;
61
+ } pm_string_file_handle_t;
62
+
63
+ /**
64
+ * Open the file indicated by the filepath parameter for reading on Windows.
65
+ * Perform any kind of normalization that needs to happen on the filepath.
66
+ */
67
+ static pm_string_init_result_t
68
+ pm_string_file_handle_open(pm_string_file_handle_t *handle, const char *filepath) {
69
+ int length = MultiByteToWideChar(CP_UTF8, 0, filepath, -1, NULL, 0);
70
+ if (length == 0) return PM_STRING_INIT_ERROR_GENERIC;
71
+
72
+ handle->path = xmalloc(sizeof(WCHAR) * ((size_t) length));
73
+ if ((handle->path == NULL) || (MultiByteToWideChar(CP_UTF8, 0, filepath, -1, handle->path, length) == 0)) {
74
+ xfree(handle->path);
75
+ return PM_STRING_INIT_ERROR_GENERIC;
76
+ }
77
+
78
+ handle->file = CreateFileW(handle->path, GENERIC_READ, FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_READONLY, NULL);
79
+ if (handle->file == INVALID_HANDLE_VALUE) {
80
+ pm_string_init_result_t result = PM_STRING_INIT_ERROR_GENERIC;
81
+
82
+ if (GetLastError() == ERROR_ACCESS_DENIED) {
83
+ DWORD attributes = GetFileAttributesW(handle->path);
84
+ if ((attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
85
+ result = PM_STRING_INIT_ERROR_DIRECTORY;
86
+ }
87
+ }
88
+
89
+ xfree(handle->path);
90
+ return result;
91
+ }
92
+
93
+ return PM_STRING_INIT_SUCCESS;
94
+ }
95
+
96
+ /**
97
+ * Close the file handle and free the path.
98
+ */
99
+ static void
100
+ pm_string_file_handle_close(pm_string_file_handle_t *handle) {
101
+ xfree(handle->path);
102
+ CloseHandle(handle->file);
103
+ }
104
+ #endif
105
+
50
106
  /**
51
107
  * Read the file indicated by the filepath parameter into source and load its
52
108
  * contents and size into the given `pm_string_t`. The given `pm_string_t`
@@ -58,62 +114,66 @@ pm_string_constant_init(pm_string_t *string, const char *source, size_t length)
58
114
  * `MapViewOfFile`, on POSIX systems that have access to `mmap` we'll use
59
115
  * `mmap`, and on other POSIX systems we'll use `read`.
60
116
  */
61
- bool
117
+ PRISM_EXPORTED_FUNCTION pm_string_init_result_t
62
118
  pm_string_mapped_init(pm_string_t *string, const char *filepath) {
63
119
  #ifdef _WIN32
64
120
  // Open the file for reading.
65
- HANDLE file = CreateFile(filepath, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
66
-
67
- if (file == INVALID_HANDLE_VALUE) {
68
- return false;
69
- }
121
+ pm_string_file_handle_t handle;
122
+ pm_string_init_result_t result = pm_string_file_handle_open(&handle, filepath);
123
+ if (result != PM_STRING_INIT_SUCCESS) return result;
70
124
 
71
125
  // Get the file size.
72
- DWORD file_size = GetFileSize(file, NULL);
126
+ DWORD file_size = GetFileSize(handle.file, NULL);
73
127
  if (file_size == INVALID_FILE_SIZE) {
74
- CloseHandle(file);
75
- return false;
128
+ pm_string_file_handle_close(&handle);
129
+ return PM_STRING_INIT_ERROR_GENERIC;
76
130
  }
77
131
 
78
132
  // If the file is empty, then we don't need to do anything else, we'll set
79
133
  // the source to a constant empty string and return.
80
134
  if (file_size == 0) {
81
- CloseHandle(file);
135
+ pm_string_file_handle_close(&handle);
82
136
  const uint8_t source[] = "";
83
137
  *string = (pm_string_t) { .type = PM_STRING_CONSTANT, .source = source, .length = 0 };
84
- return true;
138
+ return PM_STRING_INIT_SUCCESS;
85
139
  }
86
140
 
87
141
  // Create a mapping of the file.
88
- HANDLE mapping = CreateFileMapping(file, NULL, PAGE_READONLY, 0, 0, NULL);
142
+ HANDLE mapping = CreateFileMapping(handle.file, NULL, PAGE_READONLY, 0, 0, NULL);
89
143
  if (mapping == NULL) {
90
- CloseHandle(file);
91
- return false;
144
+ pm_string_file_handle_close(&handle);
145
+ return PM_STRING_INIT_ERROR_GENERIC;
92
146
  }
93
147
 
94
148
  // Map the file into memory.
95
149
  uint8_t *source = (uint8_t *) MapViewOfFile(mapping, FILE_MAP_READ, 0, 0, 0);
96
150
  CloseHandle(mapping);
97
- CloseHandle(file);
151
+ pm_string_file_handle_close(&handle);
98
152
 
99
153
  if (source == NULL) {
100
- return false;
154
+ return PM_STRING_INIT_ERROR_GENERIC;
101
155
  }
102
156
 
103
157
  *string = (pm_string_t) { .type = PM_STRING_MAPPED, .source = source, .length = (size_t) file_size };
104
- return true;
105
- #else
158
+ return PM_STRING_INIT_SUCCESS;
159
+ #elif defined(_POSIX_MAPPED_FILES)
106
160
  // Open the file for reading
107
161
  int fd = open(filepath, O_RDONLY);
108
162
  if (fd == -1) {
109
- return false;
163
+ return PM_STRING_INIT_ERROR_GENERIC;
110
164
  }
111
165
 
112
166
  // Stat the file to get the file size
113
167
  struct stat sb;
114
168
  if (fstat(fd, &sb) == -1) {
115
169
  close(fd);
116
- return false;
170
+ return PM_STRING_INIT_ERROR_GENERIC;
171
+ }
172
+
173
+ // Ensure it is a file and not a directory
174
+ if (S_ISDIR(sb.st_mode)) {
175
+ close(fd);
176
+ return PM_STRING_INIT_ERROR_DIRECTORY;
117
177
  }
118
178
 
119
179
  // mmap the file descriptor to virtually get the contents
@@ -124,30 +184,128 @@ pm_string_mapped_init(pm_string_t *string, const char *filepath) {
124
184
  close(fd);
125
185
  const uint8_t source[] = "";
126
186
  *string = (pm_string_t) { .type = PM_STRING_CONSTANT, .source = source, .length = 0 };
127
- return true;
187
+ return PM_STRING_INIT_SUCCESS;
128
188
  }
129
189
 
130
190
  source = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
131
191
  if (source == MAP_FAILED) {
132
- return false;
192
+ close(fd);
193
+ return PM_STRING_INIT_ERROR_GENERIC;
133
194
  }
134
195
 
135
196
  close(fd);
136
197
  *string = (pm_string_t) { .type = PM_STRING_MAPPED, .source = source, .length = size };
137
- return true;
198
+ return PM_STRING_INIT_SUCCESS;
199
+ #else
200
+ return pm_string_file_init(string, filepath);
138
201
  #endif
139
202
  }
140
203
 
141
204
  /**
142
- * Returns the memory size associated with the string.
205
+ * Read the file indicated by the filepath parameter into source and load its
206
+ * contents and size into the given `pm_string_t`. The given `pm_string_t`
207
+ * should be freed using `pm_string_free` when it is no longer used.
143
208
  */
144
- size_t
145
- pm_string_memsize(const pm_string_t *string) {
146
- size_t size = sizeof(pm_string_t);
147
- if (string->type == PM_STRING_OWNED) {
148
- size += string->length;
209
+ PRISM_EXPORTED_FUNCTION pm_string_init_result_t
210
+ pm_string_file_init(pm_string_t *string, const char *filepath) {
211
+ #ifdef _WIN32
212
+ // Open the file for reading.
213
+ pm_string_file_handle_t handle;
214
+ pm_string_init_result_t result = pm_string_file_handle_open(&handle, filepath);
215
+ if (result != PM_STRING_INIT_SUCCESS) return result;
216
+
217
+ // Get the file size.
218
+ DWORD file_size = GetFileSize(handle.file, NULL);
219
+ if (file_size == INVALID_FILE_SIZE) {
220
+ pm_string_file_handle_close(&handle);
221
+ return PM_STRING_INIT_ERROR_GENERIC;
222
+ }
223
+
224
+ // If the file is empty, then we don't need to do anything else, we'll set
225
+ // the source to a constant empty string and return.
226
+ if (file_size == 0) {
227
+ pm_string_file_handle_close(&handle);
228
+ const uint8_t source[] = "";
229
+ *string = (pm_string_t) { .type = PM_STRING_CONSTANT, .source = source, .length = 0 };
230
+ return PM_STRING_INIT_SUCCESS;
231
+ }
232
+
233
+ // Create a buffer to read the file into.
234
+ uint8_t *source = xmalloc(file_size);
235
+ if (source == NULL) {
236
+ pm_string_file_handle_close(&handle);
237
+ return PM_STRING_INIT_ERROR_GENERIC;
238
+ }
239
+
240
+ // Read the contents of the file
241
+ DWORD bytes_read;
242
+ if (!ReadFile(handle.file, source, file_size, &bytes_read, NULL)) {
243
+ pm_string_file_handle_close(&handle);
244
+ return PM_STRING_INIT_ERROR_GENERIC;
245
+ }
246
+
247
+ // Check the number of bytes read
248
+ if (bytes_read != file_size) {
249
+ xfree(source);
250
+ pm_string_file_handle_close(&handle);
251
+ return PM_STRING_INIT_ERROR_GENERIC;
252
+ }
253
+
254
+ pm_string_file_handle_close(&handle);
255
+ *string = (pm_string_t) { .type = PM_STRING_OWNED, .source = source, .length = (size_t) file_size };
256
+ return PM_STRING_INIT_SUCCESS;
257
+ #elif defined(PRISM_HAS_FILESYSTEM)
258
+ // Open the file for reading
259
+ int fd = open(filepath, O_RDONLY);
260
+ if (fd == -1) {
261
+ return PM_STRING_INIT_ERROR_GENERIC;
262
+ }
263
+
264
+ // Stat the file to get the file size
265
+ struct stat sb;
266
+ if (fstat(fd, &sb) == -1) {
267
+ close(fd);
268
+ return PM_STRING_INIT_ERROR_GENERIC;
269
+ }
270
+
271
+ // Ensure it is a file and not a directory
272
+ if (S_ISDIR(sb.st_mode)) {
273
+ close(fd);
274
+ return PM_STRING_INIT_ERROR_DIRECTORY;
275
+ }
276
+
277
+ // Check the size to see if it's empty
278
+ size_t size = (size_t) sb.st_size;
279
+ if (size == 0) {
280
+ close(fd);
281
+ const uint8_t source[] = "";
282
+ *string = (pm_string_t) { .type = PM_STRING_CONSTANT, .source = source, .length = 0 };
283
+ return PM_STRING_INIT_SUCCESS;
284
+ }
285
+
286
+ size_t length = (size_t) size;
287
+ uint8_t *source = xmalloc(length);
288
+ if (source == NULL) {
289
+ close(fd);
290
+ return PM_STRING_INIT_ERROR_GENERIC;
291
+ }
292
+
293
+ long bytes_read = (long) read(fd, source, length);
294
+ close(fd);
295
+
296
+ if (bytes_read == -1) {
297
+ xfree(source);
298
+ return PM_STRING_INIT_ERROR_GENERIC;
149
299
  }
150
- return size;
300
+
301
+ *string = (pm_string_t) { .type = PM_STRING_OWNED, .source = source, .length = length };
302
+ return PM_STRING_INIT_SUCCESS;
303
+ #else
304
+ (void) string;
305
+ (void) filepath;
306
+ perror("pm_string_file_init is not implemented for this platform");
307
+ return PM_STRING_INIT_ERROR_GENERIC;
308
+ #endif
151
309
  }
152
310
 
153
311
  /**
@@ -161,13 +319,33 @@ pm_string_ensure_owned(pm_string_t *string) {
161
319
  size_t length = pm_string_length(string);
162
320
  const uint8_t *source = pm_string_source(string);
163
321
 
164
- uint8_t *memory = malloc(length);
322
+ uint8_t *memory = xmalloc(length);
165
323
  if (!memory) return;
166
324
 
167
325
  pm_string_owned_init(string, memory, length);
168
326
  memcpy((void *) string->source, source, length);
169
327
  }
170
328
 
329
+ /**
330
+ * Compare the underlying lengths and bytes of two strings. Returns 0 if the
331
+ * strings are equal, a negative number if the left string is less than the
332
+ * right string, and a positive number if the left string is greater than the
333
+ * right string.
334
+ */
335
+ int
336
+ pm_string_compare(const pm_string_t *left, const pm_string_t *right) {
337
+ size_t left_length = pm_string_length(left);
338
+ size_t right_length = pm_string_length(right);
339
+
340
+ if (left_length < right_length) {
341
+ return -1;
342
+ } else if (left_length > right_length) {
343
+ return 1;
344
+ }
345
+
346
+ return memcmp(pm_string_source(left), pm_string_source(right), left_length);
347
+ }
348
+
171
349
  /**
172
350
  * Returns the length associated with the string.
173
351
  */
@@ -192,12 +370,14 @@ pm_string_free(pm_string_t *string) {
192
370
  void *memory = (void *) string->source;
193
371
 
194
372
  if (string->type == PM_STRING_OWNED) {
195
- free(memory);
373
+ xfree(memory);
374
+ #ifdef PRISM_HAS_MMAP
196
375
  } else if (string->type == PM_STRING_MAPPED && string->length) {
197
376
  #if defined(_WIN32)
198
377
  UnmapViewOfFile(memory);
199
- #else
378
+ #elif defined(_POSIX_MAPPED_FILES)
200
379
  munmap(memory, string->length);
201
380
  #endif
381
+ #endif /* PRISM_HAS_MMAP */
202
382
  }
203
383
  }
@@ -1,5 +1,17 @@
1
1
  #include "prism/util/pm_strncasecmp.h"
2
2
 
/**
 * Lowercase an ASCII letter without consulting the locale, unlike
 * `tolower(3)`. Values in 'A'..'Z' have the 0x20 bit set to yield the matching
 * lowercase letter; every other value is returned unchanged.
 */
static inline int
pm_tolower(int c)
{
    return ('A' <= c && c <= 'Z') ? (c | 0x20) : c;
}
14
+
3
15
  /**
4
16
  * Compare two strings, ignoring case, up to the given length. Returns 0 if the
5
17
  * strings are equal, a negative number if string1 is less than string2, or a
@@ -16,7 +28,7 @@ pm_strncasecmp(const uint8_t *string1, const uint8_t *string2, size_t length) {
16
28
 
17
29
  while (offset < length && string1[offset] != '\0') {
18
30
  if (string2[offset] == '\0') return string1[offset];
19
- if ((difference = tolower(string1[offset]) - tolower(string2[offset])) != 0) return difference;
31
+ if ((difference = pm_tolower(string1[offset]) - pm_tolower(string2[offset])) != 0) return difference;
20
32
  offset++;
21
33
  }
22
34
 
@@ -8,6 +8,27 @@ pm_strpbrk_invalid_multibyte_character(pm_parser_t *parser, const uint8_t *start
8
8
  pm_diagnostic_list_append_format(&parser->error_list, start, end, PM_ERR_INVALID_MULTIBYTE_CHARACTER, *start);
9
9
  }
10
10
 
11
+ /**
12
+ * Set the explicit encoding for the parser to the current encoding.
13
+ */
14
+ static inline void
15
+ pm_strpbrk_explicit_encoding_set(pm_parser_t *parser, const uint8_t *source, size_t width) {
16
+ if (parser->explicit_encoding != NULL) {
17
+ if (parser->explicit_encoding == parser->encoding) {
18
+ // Okay, we already locked to this encoding.
19
+ } else if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) {
20
+ // Not okay, we already found a Unicode escape sequence and this
21
+ // conflicts.
22
+ pm_diagnostic_list_append_format(&parser->error_list, source, source + width, PM_ERR_MIXED_ENCODING, parser->encoding->name);
23
+ } else {
24
+ // Should not be anything else.
25
+ assert(false && "unreachable");
26
+ }
27
+ }
28
+
29
+ parser->explicit_encoding = parser->encoding;
30
+ }
31
+
11
32
  /**
12
33
  * This is the default path.
13
34
  */
@@ -52,7 +73,7 @@ pm_strpbrk_utf8(pm_parser_t *parser, const uint8_t *source, const uint8_t *chars
52
73
  * This is the path when the encoding is ASCII-8BIT.
53
74
  */
54
75
  static inline const uint8_t *
55
- pm_strpbrk_ascii_8bit(const uint8_t *source, const uint8_t *charset, size_t maximum) {
76
+ pm_strpbrk_ascii_8bit(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
56
77
  size_t index = 0;
57
78
 
58
79
  while (index < maximum) {
@@ -60,6 +81,7 @@ pm_strpbrk_ascii_8bit(const uint8_t *source, const uint8_t *charset, size_t maxi
60
81
  return source + index;
61
82
  }
62
83
 
84
+ if (validate && source[index] >= 0x80) pm_strpbrk_explicit_encoding_set(parser, source, 1);
63
85
  index++;
64
86
  }
65
87
 
@@ -72,6 +94,7 @@ pm_strpbrk_ascii_8bit(const uint8_t *source, const uint8_t *charset, size_t maxi
72
94
  static inline const uint8_t *
73
95
  pm_strpbrk_multi_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
74
96
  size_t index = 0;
97
+ const pm_encoding_t *encoding = parser->encoding;
75
98
 
76
99
  while (index < maximum) {
77
100
  if (strchr((const char *) charset, source[index]) != NULL) {
@@ -81,7 +104,8 @@ pm_strpbrk_multi_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t
81
104
  if (source[index] < 0x80) {
82
105
  index++;
83
106
  } else {
84
- size_t width = parser->encoding->char_width(source + index, (ptrdiff_t) (maximum - index));
107
+ size_t width = encoding->char_width(source + index, (ptrdiff_t) (maximum - index));
108
+ if (validate) pm_strpbrk_explicit_encoding_set(parser, source, width);
85
109
 
86
110
  if (width > 0) {
87
111
  index += width;
@@ -96,7 +120,7 @@ pm_strpbrk_multi_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t
96
120
 
97
121
  do {
98
122
  index++;
99
- } while (index < maximum && parser->encoding->char_width(source + index, (ptrdiff_t) (maximum - index)) == 0);
123
+ } while (index < maximum && encoding->char_width(source + index, (ptrdiff_t) (maximum - index)) == 0);
100
124
 
101
125
  pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index);
102
126
  }
@@ -113,6 +137,7 @@ pm_strpbrk_multi_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t
113
137
  static inline const uint8_t *
114
138
  pm_strpbrk_single_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
115
139
  size_t index = 0;
140
+ const pm_encoding_t *encoding = parser->encoding;
116
141
 
117
142
  while (index < maximum) {
118
143
  if (strchr((const char *) charset, source[index]) != NULL) {
@@ -122,7 +147,8 @@ pm_strpbrk_single_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t
122
147
  if (source[index] < 0x80 || !validate) {
123
148
  index++;
124
149
  } else {
125
- size_t width = parser->encoding->char_width(source + index, (ptrdiff_t) (maximum - index));
150
+ size_t width = encoding->char_width(source + index, (ptrdiff_t) (maximum - index));
151
+ pm_strpbrk_explicit_encoding_set(parser, source, width);
126
152
 
127
153
  if (width > 0) {
128
154
  index += width;
@@ -135,7 +161,7 @@ pm_strpbrk_single_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t
135
161
 
136
162
  do {
137
163
  index++;
138
- } while (index < maximum && parser->encoding->char_width(source + index, (ptrdiff_t) (maximum - index)) == 0);
164
+ } while (index < maximum && encoding->char_width(source + index, (ptrdiff_t) (maximum - index)) == 0);
139
165
 
140
166
  pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index);
141
167
  }
@@ -171,7 +197,7 @@ pm_strpbrk(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, p
171
197
  } else if (!parser->encoding_changed) {
172
198
  return pm_strpbrk_utf8(parser, source, charset, (size_t) length, validate);
173
199
  } else if (parser->encoding == PM_ENCODING_ASCII_8BIT_ENTRY) {
174
- return pm_strpbrk_ascii_8bit(source, charset, (size_t) length);
200
+ return pm_strpbrk_ascii_8bit(parser, source, charset, (size_t) length, validate);
175
201
  } else if (parser->encoding->multibyte) {
176
202
  return pm_strpbrk_multi_byte(parser, source, charset, (size_t) length, validate);
177
203
  } else {