@ast-grep/lang-scala 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/scanner.c ADDED
@@ -0,0 +1,420 @@
1
+ #include "tree_sitter/alloc.h"
2
+ #include "tree_sitter/array.h"
3
+ #include "tree_sitter/parser.h"
4
+
5
+ #include <wctype.h>
6
+
7
// Uncomment the next line to trace scanner decisions on stderr.
// #define DEBUG

// LOG(...) forwards its arguments to fprintf(stderr, ...) in DEBUG builds and
// expands to nothing otherwise, so its arguments are not evaluated at all
// when tracing is off.
#if defined(DEBUG)
#define LOG(...) fprintf(stderr, __VA_ARGS__)
#else
#define LOG(...)
#endif
14
+
15
// Token kinds this scanner knows about. Each value is used as an index into
// `valid_symbols` and, for the kinds the scanner emits, stored in
// `lexer->result_symbol`.
// NOTE(review): the numeric order presumably has to mirror the grammar's list
// of external tokens -- confirm before reordering or inserting entries.
enum TokenType {
  // Emitted by the scanner.
  AUTOMATIC_SEMICOLON = 0,               // statement break implied by a newline
  INDENT,                                // a deeper indentation level was opened
  INTERPOLATED_STRING_MIDDLE,            // interpolated string content up to a `$`
  INTERPOLATED_STRING_END,               // interpolated string content up to the closing `"`
  INTERPOLATED_MULTILINE_STRING_MIDDLE,  // same as above, for `"""` strings
  INTERPOLATED_MULTILINE_STRING_END,
  OUTDENT,                               // an indentation level was closed
  SIMPLE_MULTILINE_STRING,               // complete `"""..."""` literal
  SIMPLE_STRING,                         // complete `"..."` literal

  // Never emitted here; only looked up in `valid_symbols` so that no
  // AUTOMATIC_SEMICOLON is produced in front of these keywords.
  ELSE,
  CATCH,
  FINALLY,
  EXTENDS,
  DERIVES,
  WITH,
};
32
+
33
+ typedef struct {
34
+ Array(int16_t) indents;
35
+ int16_t last_indentation_size;
36
+ int16_t last_newline_count;
37
+ int16_t last_column;
38
+ } Scanner;
39
+
40
+ void *tree_sitter_scala_external_scanner_create() {
41
+ Scanner *scanner = ts_calloc(1, sizeof(Scanner));
42
+ array_init(&scanner->indents);
43
+ scanner->last_indentation_size = -1;
44
+ scanner->last_column = -1;
45
+ return scanner;
46
+ }
47
+
48
+ void tree_sitter_scala_external_scanner_destroy(void *payload) {
49
+ Scanner *scanner = payload;
50
+ array_delete(&scanner->indents);
51
+ ts_free(scanner);
52
+ }
53
+
54
+ unsigned tree_sitter_scala_external_scanner_serialize(void *payload, char *buffer) {
55
+ Scanner *scanner = (Scanner*)payload;
56
+
57
+ if ((scanner->indents.size + 3) * sizeof(int16_t) > TREE_SITTER_SERIALIZATION_BUFFER_SIZE) {
58
+ return 0;
59
+ }
60
+
61
+ size_t size = 0;
62
+ memcpy(buffer + size, &scanner->last_indentation_size, sizeof(int16_t));
63
+ size += sizeof(int16_t);
64
+ memcpy(buffer + size, &scanner->last_newline_count, sizeof(int16_t));
65
+ size += sizeof(int16_t);
66
+ memcpy(buffer + size, &scanner->last_column, sizeof(int16_t));
67
+ size += sizeof(int16_t);
68
+
69
+ for (unsigned i = 0; i < scanner->indents.size; i++) {
70
+ memcpy(buffer + size, &scanner->indents.contents[i], sizeof(int16_t));
71
+ size += sizeof(int16_t);
72
+ }
73
+
74
+ return size;
75
+ }
76
+
77
+ void tree_sitter_scala_external_scanner_deserialize(void *payload, const char *buffer,
78
+ unsigned length) {
79
+ Scanner *scanner = (Scanner*)payload;
80
+ array_clear(&scanner->indents);
81
+ scanner->last_indentation_size = -1;
82
+ scanner->last_column = -1;
83
+ scanner->last_newline_count = 0;
84
+
85
+ if (length == 0) {
86
+ return;
87
+ }
88
+
89
+ size_t size = 0;
90
+
91
+ scanner->last_indentation_size = *(int16_t *)&buffer[size];
92
+ size += sizeof(int16_t);
93
+ scanner->last_newline_count = *(int16_t *)&buffer[size];
94
+ size += sizeof(int16_t);
95
+ scanner->last_column = *(int16_t *)&buffer[size];
96
+ size += sizeof(int16_t);
97
+
98
+ while (size < length) {
99
+ array_push(&scanner->indents, *(int16_t *)&buffer[size]);
100
+ size += sizeof(int16_t);
101
+ }
102
+
103
+ assert(size == length);
104
+ }
105
+
106
+ static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); }
107
+
108
+ static inline void skip(TSLexer *lexer) { lexer->advance(lexer, true); }
109
+
110
+ static bool scan_string_content(TSLexer *lexer, bool is_multiline, bool has_interpolation) {
111
+ unsigned closing_quote_count = 0;
112
+ for (;;) {
113
+ if (lexer->lookahead == '"') {
114
+ advance(lexer);
115
+ closing_quote_count++;
116
+ if (!is_multiline) {
117
+ lexer->result_symbol = has_interpolation ? INTERPOLATED_STRING_END : SIMPLE_STRING;
118
+ return true;
119
+ }
120
+ if (closing_quote_count >= 3 && lexer->lookahead != '"') {
121
+ lexer->result_symbol = has_interpolation ? INTERPOLATED_MULTILINE_STRING_END : SIMPLE_MULTILINE_STRING;
122
+ return true;
123
+ }
124
+ } else if (lexer->lookahead == '$') {
125
+ if (is_multiline && has_interpolation) {
126
+ lexer->result_symbol = INTERPOLATED_MULTILINE_STRING_MIDDLE;
127
+ return true;
128
+ }
129
+ if (has_interpolation) {
130
+ lexer->result_symbol = INTERPOLATED_STRING_MIDDLE;
131
+ return true;
132
+ }
133
+ advance(lexer);
134
+ } else {
135
+ closing_quote_count = 0;
136
+ if (lexer->lookahead == '\\') {
137
+ advance(lexer);
138
+ if (!lexer->eof(lexer)) {
139
+ advance(lexer);
140
+ }
141
+ } else if (lexer->lookahead == '\n') {
142
+ if (is_multiline) {
143
+ advance(lexer);
144
+ } else {
145
+ return false;
146
+ }
147
+ } else if (lexer->eof(lexer)) {
148
+ return false;
149
+ } else {
150
+ advance(lexer);
151
+ }
152
+ }
153
+ }
154
+ }
155
+
156
+ static bool detect_comment_start(TSLexer *lexer) {
157
+ lexer->mark_end(lexer);
158
+ // Comments should not affect indentation
159
+ if (lexer->lookahead == '/') {
160
+ advance(lexer);
161
+ if (lexer->lookahead == '/' || lexer -> lookahead == '*') {
162
+ return true;
163
+ }
164
+ }
165
+ return false;
166
+ }
167
+
168
+ static bool scan_word(TSLexer *lexer, const char* const word) {
169
+ for (uint8_t i = 0; word[i] != '\0'; i++) {
170
+ if (lexer->lookahead != word[i]) {
171
+ return false;
172
+ }
173
+ advance(lexer);
174
+ }
175
+ return !iswalnum(lexer->lookahead);
176
+ }
177
+
178
+ static inline void debug_indents(Scanner *scanner) {
179
+ LOG(" indents(%d): ", scanner->indents.size);
180
+ for (unsigned i = 0; i < scanner->indents.size; i++) {
181
+ LOG("%d ", scanner->indents.contents[i]);
182
+ }
183
+ LOG("\n");
184
+ }
185
+
186
+ bool tree_sitter_scala_external_scanner_scan(void *payload, TSLexer *lexer,
187
+ const bool *valid_symbols) {
188
+ Scanner *scanner = (Scanner *)payload;
189
+ int16_t prev = scanner->indents.size > 0 ? *array_back(&scanner->indents) : -1;
190
+ int16_t newline_count = 0;
191
+ int16_t indentation_size = 0;
192
+
193
+ while (iswspace(lexer->lookahead)) {
194
+ if (lexer->lookahead == '\n') {
195
+ newline_count++;
196
+ indentation_size = 0;
197
+ }
198
+ else {
199
+ indentation_size++;
200
+ }
201
+ skip(lexer);
202
+ }
203
+
204
+ // Before advancing the lexer, check if we can double outdent
205
+ if (
206
+ valid_symbols[OUTDENT] &&
207
+ (
208
+ lexer->lookahead == 0 ||
209
+ (
210
+ prev != -1 &&
211
+ (
212
+ lexer->lookahead == ')' ||
213
+ lexer->lookahead == ']' ||
214
+ lexer->lookahead == '}'
215
+ )
216
+ ) ||
217
+ (
218
+ scanner->last_indentation_size != -1 &&
219
+ prev != -1 &&
220
+ scanner->last_indentation_size < prev
221
+ )
222
+ )
223
+ ) {
224
+ if (scanner->indents.size > 0) {
225
+ array_pop(&scanner->indents);
226
+ }
227
+ LOG(" pop\n");
228
+ LOG(" OUTDENT\n");
229
+ lexer->result_symbol = OUTDENT;
230
+ return true;
231
+ }
232
+ scanner->last_indentation_size = -1;
233
+
234
+ if (
235
+ valid_symbols[INDENT] &&
236
+ newline_count > 0 &&
237
+ (
238
+ scanner->indents.size == 0 ||
239
+ indentation_size > *array_back(&scanner->indents)
240
+ )
241
+ ) {
242
+ if (detect_comment_start(lexer)) {
243
+ return false;
244
+ }
245
+ array_push(&scanner->indents, indentation_size);
246
+ lexer->result_symbol = INDENT;
247
+ LOG(" INDENT\n");
248
+ return true;
249
+ }
250
+
251
+ // This saves the indentation_size and newline_count so it can be used
252
+ // in subsequent calls for multiple outdent or autosemicolon.
253
+ if (valid_symbols[OUTDENT] &&
254
+ (lexer->lookahead == 0 ||
255
+ (
256
+ newline_count > 0 &&
257
+ prev != -1 &&
258
+ indentation_size < prev
259
+ )
260
+ )
261
+ ) {
262
+ if (scanner->indents.size > 0) {
263
+ array_pop(&scanner->indents);
264
+ }
265
+ LOG(" pop\n");
266
+ LOG(" OUTDENT\n");
267
+ lexer->result_symbol = OUTDENT;
268
+ lexer->mark_end(lexer);
269
+ if (detect_comment_start(lexer)) {
270
+ return false;
271
+ }
272
+ scanner->last_indentation_size = indentation_size;
273
+ scanner->last_newline_count = newline_count;
274
+ if (lexer->eof(lexer)) {
275
+ scanner->last_column = -1;
276
+ } else {
277
+ scanner->last_column = (int16_t)lexer->get_column(lexer);
278
+ }
279
+ return true;
280
+ }
281
+
282
+ // Recover newline_count from the outdent reset
283
+ bool is_eof = lexer->eof(lexer);
284
+ if (
285
+ (
286
+ scanner->last_newline_count > 0 &&
287
+ (is_eof && scanner->last_column == -1)
288
+ ) ||
289
+ (!is_eof && lexer->get_column(lexer) == (uint32_t)scanner->last_column)
290
+ ) {
291
+ newline_count += scanner->last_newline_count;
292
+ }
293
+ scanner->last_newline_count = 0;
294
+
295
+ if (valid_symbols[AUTOMATIC_SEMICOLON] && newline_count > 0) {
296
+ // AUTOMATIC_SEMICOLON should not be issued in the middle of expressions
297
+ // Thus, we exit this branch when encountering comments, else/catch clauses, etc.
298
+
299
+ lexer->mark_end(lexer);
300
+ lexer->result_symbol = AUTOMATIC_SEMICOLON;
301
+
302
+ // Probably, a multi-line field expression, e.g.
303
+ // a
304
+ // .b
305
+ // .c
306
+ if (lexer->lookahead == '.') {
307
+ return false;
308
+ }
309
+
310
+ // Single-line and multi-line comments
311
+ if (lexer->lookahead == '/') {
312
+ advance(lexer);
313
+ if (lexer->lookahead == '/') {
314
+ return false;
315
+ }
316
+ if (lexer->lookahead == '*') {
317
+ advance(lexer);
318
+ while (!lexer->eof(lexer)) {
319
+ if (lexer->lookahead == '*') {
320
+ advance(lexer);
321
+ if (lexer->lookahead == '/') {
322
+ advance(lexer);
323
+ break;
324
+ }
325
+ } else {
326
+ advance(lexer);
327
+ }
328
+ }
329
+ while (iswspace(lexer->lookahead)) {
330
+ if (lexer->lookahead == '\n' || lexer->lookahead == '\r') {
331
+ return false;
332
+ }
333
+ skip(lexer);
334
+ }
335
+ // If some code is present at the same line after comment end,
336
+ // we should still produce AUTOMATIC_SEMICOLON, e.g. in
337
+ // val a = 1
338
+ // /* comment */ val b = 2
339
+ return true;
340
+ }
341
+ }
342
+
343
+ if (valid_symbols[ELSE]) {
344
+ return !scan_word(lexer, "else");
345
+ }
346
+
347
+ if (valid_symbols[CATCH]) {
348
+ if (scan_word(lexer, "catch")) {
349
+ return false;
350
+ }
351
+ }
352
+
353
+ if (valid_symbols[FINALLY]) {
354
+ if (scan_word(lexer, "finally")) {
355
+ return false;
356
+ }
357
+ }
358
+
359
+ if (valid_symbols[EXTENDS]) {
360
+ if (scan_word(lexer, "extends")) {
361
+ return false;
362
+ }
363
+ }
364
+
365
+ if (valid_symbols[WITH]) {
366
+ if (scan_word(lexer, "with")) {
367
+ return false;
368
+ }
369
+ }
370
+
371
+ if (valid_symbols[DERIVES]) {
372
+ if (scan_word(lexer, "derives")) {
373
+ return false;
374
+ }
375
+ }
376
+
377
+ if (newline_count > 1) {
378
+ return true;
379
+ }
380
+
381
+ return true;
382
+ }
383
+
384
+ while (iswspace(lexer->lookahead)) {
385
+ if (lexer->lookahead == '\n') {
386
+ newline_count++;
387
+ }
388
+ skip(lexer);
389
+ }
390
+
391
+ if (valid_symbols[SIMPLE_STRING] && lexer->lookahead == '"') {
392
+ advance(lexer);
393
+
394
+ bool is_multiline = false;
395
+ if (lexer->lookahead == '"') {
396
+ advance(lexer);
397
+ if (lexer->lookahead == '"') {
398
+ advance(lexer);
399
+ is_multiline = true;
400
+ } else {
401
+ lexer->result_symbol = SIMPLE_STRING;
402
+ return true;
403
+ }
404
+ }
405
+
406
+ return scan_string_content(lexer, is_multiline, false);
407
+ }
408
+
409
+ if (valid_symbols[INTERPOLATED_STRING_MIDDLE]) {
410
+ return scan_string_content(lexer, false, true);
411
+ }
412
+
413
+ if (valid_symbols[INTERPOLATED_MULTILINE_STRING_MIDDLE]) {
414
+ return scan_string_content(lexer, true, true);
415
+ }
416
+
417
+ return false;
418
+ }
419
+
420
+ //
@@ -0,0 +1,54 @@
1
#ifndef TREE_SITTER_ALLOC_H_
#define TREE_SITTER_ALLOC_H_

#ifdef __cplusplus
extern "C" {
#endif

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

// ts_malloc / ts_calloc / ts_realloc / ts_free are the allocation entry
// points used by the scanner. Each one can be pre-defined by the client
// before this header is included; otherwise it is defined here:
//   - with TREE_SITTER_REUSE_ALLOCATOR: to the externally provided
//     ts_current_* function pointers,
//   - without it: to the C standard library functions.
#if defined(TREE_SITTER_REUSE_ALLOCATOR)

extern void *(*ts_current_malloc)(size_t size);
extern void *(*ts_current_calloc)(size_t count, size_t size);
extern void *(*ts_current_realloc)(void *ptr, size_t size);
extern void (*ts_current_free)(void *ptr);

#if !defined(ts_malloc)
#define ts_malloc ts_current_malloc
#endif
#if !defined(ts_calloc)
#define ts_calloc ts_current_calloc
#endif
#if !defined(ts_realloc)
#define ts_realloc ts_current_realloc
#endif
#if !defined(ts_free)
#define ts_free ts_current_free
#endif

#else // !TREE_SITTER_REUSE_ALLOCATOR

#if !defined(ts_malloc)
#define ts_malloc malloc
#endif
#if !defined(ts_calloc)
#define ts_calloc calloc
#endif
#if !defined(ts_realloc)
#define ts_realloc realloc
#endif
#if !defined(ts_free)
#define ts_free free
#endif

#endif // TREE_SITTER_REUSE_ALLOCATOR

#ifdef __cplusplus
}
#endif

#endif // TREE_SITTER_ALLOC_H_