@acristoffers/tree-sitter-matlab 1.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/scanner.c ADDED
@@ -0,0 +1,1094 @@
1
+ #include "tree_sitter/parser.h"
2
+
3
+ #include <stddef.h>
4
+ #include <stdint.h>
5
+ #include <stdio.h>
6
+ #include <string.h>
7
+ #include <wctype.h>
8
+
9
// ASCII-only replacement for ispunct(), kept for WASM compatibility:
// ispunct() is not in tree-sitter's WASM allowed functions list
// https://github.com/tree-sitter/tree-sitter/blob/master/lib/src/wasm/stdlib-symbols.txt
//
// Returns true for the printable ASCII characters that are neither
// alphanumeric nor the space character; every code point >= 0x80 is rejected.
static inline bool is_punct_char(const uint32_t chr)
{
    // Outside 33..126 there is only control characters, space, DEL and
    // non-ASCII code points.
    if (chr < 33 || chr > 126) {
        return false;
    }

    // Inside the printable range, punctuation is whatever is left after
    // removing digits and letters.
    if (chr >= 48 && chr <= 57) { // 0-9
        return false;
    }
    if (chr >= 65 && chr <= 90) { // A-Z
        return false;
    }
    if (chr >= 97 && chr <= 122) { // a-z
        return false;
    }

    return true;
}
23
+
24
// Token kinds this scanner reports through lexer->result_symbol. The same
// values are used as indexes into the valid_symbols array handed to the scan
// entry point.
// NOTE(review): the order presumably has to mirror the `externals` list of
// the grammar, which is not visible in this file - confirm before reordering.
enum TokenType {
    COMMENT,                    // `%` line comments and `%{ ... %}` block comments
    LINE_CONTINUATION,          // `...` and the rest of that line
    COMMAND_NAME,               // name in command syntax, or the `!` shell escape
    COMMAND_ARGUMENT,           // one argument of a command
    SINGLE_QUOTE_STRING_START,  // opening '
    SINGLE_QUOTE_STRING_END,    // closing '
    DOUBLE_QUOTE_STRING_START,  // opening "
    DOUBLE_QUOTE_STRING_END,    // closing "
    FORMATTING_SEQUENCE,        // `%%` or a `%...` conversion inside a string
    ESCAPE_SEQUENCE,            // backslash escape inside a string
    STRING_CONTENT,             // plain text inside a string
    ENTRY_DELIMITER,            // separator between entries of a matrix/cell row
    MULTIOUTPUT_VARIABLE_START, // `[` of a `[...] =` assignment target
    IDENTIFIER,                 // identifier that is not a keyword
    CATCH_IDENTIFIER,           // emitted instead of COMMAND_NAME when valid and the name ends the statement
    TRANSPOSE,                  // emitted by scan_transpose for '
    CTRANSPOSE,                 // emitted by scan_comment for ' when requested by the caller
    ERROR_SENTINEL,             // never produced or tested in this file; NOTE(review): presumably only valid during error recovery - confirm
};
44
+
45
// Persistent scanner state. It is written to and restored from a 4-byte
// buffer (one byte per field, in declaration order) by the serialize and
// deserialize functions.
typedef struct
{
    // True while the arguments of a command are being scanned; set by
    // scan_command and cleared by scan_command_argument at the end of the line.
    bool is_inside_command;
    // Set by scan_command_argument when `...` directly follows argument text,
    // so that the next call may scan the continuation even inside a command.
    bool line_continuation;
    // True while the arguments of a `!` shell escape are being scanned; those
    // are split on spaces only.
    bool is_shell_scape;
    // Delimiter (' or ") of the string being scanned, or 0 when the scanner
    // is not inside a string.
    char string_delimiter;
} Scanner;
52
+
53
// Reserved words of the language, in alphabetical order. An identifier that
// matches one of these is left to the internal lexer instead of being
// reported as IDENTIFIER or COMMAND_NAME.
static const char* const keywords[] = {
    "arguments",
    "break",
    "case",
    "catch",
    "classdef",
    "continue",
    "else",
    "elseif",
    "end",
    "enumeration",
    "events",
    "for",
    "function",
    "global",
    "if",
    "methods",
    "otherwise",
    "parfor",
    "persistent",
    "return",
    "spmd",
    "switch",
    "try",
    "while",
};
// Number of entries in `keywords`.
static const size_t keywords_size = sizeof keywords / sizeof *keywords;
59
+
60
// Moves the lexer to the next character, keeping the current one as part of
// the token being scanned (second argument of lexer->advance is false).
static inline void advance(TSLexer* lexer)
{
    lexer->advance(lexer, false);
}
64
+
65
// Moves the lexer to the next character, marking the current one as skipped
// (second argument of lexer->advance is true) so it is not part of the token.
static inline void skip(TSLexer* lexer)
{
    lexer->advance(lexer, true);
}
69
+
70
+ static inline bool consume_char(char chr, TSLexer* lexer)
71
+ {
72
+ if (lexer->lookahead != chr) {
73
+ return false;
74
+ }
75
+ advance(lexer);
76
+ return true;
77
+ }
78
+
79
// True for the characters that terminate a statement: line feed, carriage
// return, comma and semicolon.
static inline bool is_eol(const uint32_t chr)
{
    switch (chr) {
        case '\n':
        case '\r':
        case ',':
        case ';':
            return true;
        default:
            return false;
    }
}
83
+
84
// Whitespace that does not end a line: anything iswspace() accepts except
// line feed and carriage return.
static inline bool iswspace_matlab(const uint32_t chr)
{
    if (chr == '\n' || chr == '\r') {
        return false;
    }
    return iswspace(chr) != 0;
}
88
+
89
// Tells whether `chr` may appear in an identifier: letters and `_` anywhere,
// digits everywhere except in the first position (`start` == true).
static inline bool is_identifier(const uint32_t chr, const bool start)
{
    // isalpha or isdigit is SIGSEGVing on some UTF-8 chars, like U+10C6BD
    // (0xF48C9ABD), a file with just those bytes shows the problem. Only
    // ASCII is ever handed to the classification functions.
    if (chr >= 0x80) {
        return false;
    }

    if (chr == '_') {
        return true;
    }

    if (iswalpha(chr)) {
        return true;
    }

    return !start && iswdigit(chr);
}
103
+
104
+ static inline void consume_identifier(TSLexer* lexer, char* buffer)
105
+ {
106
+ size_t size = 0;
107
+ if (is_identifier(lexer->lookahead, true)) {
108
+ buffer[size] = (char) lexer->lookahead;
109
+ advance(lexer);
110
+ while (is_identifier(lexer->lookahead, false)) {
111
+ if (size == 255) {
112
+ buffer[0] = 0;
113
+ return;
114
+ }
115
+ buffer[++size] = (char) lexer->lookahead;
116
+ advance(lexer);
117
+ }
118
+ return;
119
+ }
120
+ buffer[0] = 0;
121
+ }
122
+
123
+ static inline int skip_whitespaces(TSLexer* lexer)
124
+ {
125
+ // 0b001 -> something skipped
126
+ // 0b010 -> newline skipped
127
+ // 0b100 -> newline was at the end of skipped sequence
128
+ int skipped = 0;
129
+ while (!lexer->eof(lexer) && iswspace(lexer->lookahead)) {
130
+ skipped &= 0b011;
131
+ if (lexer->lookahead == '\n' || lexer->lookahead == '\r') {
132
+ skipped |= 0b111;
133
+ } else {
134
+ skipped |= 0b001;
135
+ }
136
+ skip(lexer);
137
+ }
138
+ return skipped;
139
+ }
140
+
141
+ static inline int consume_whitespaces(TSLexer* lexer)
142
+ {
143
+ int skipped = 0;
144
+ while (iswspace(lexer->lookahead)) {
145
+ skipped &= 0b011;
146
+ if (lexer->lookahead == '\n' || lexer->lookahead == '\r') {
147
+ skipped |= 0b111;
148
+ } else {
149
+ skipped |= 0b001;
150
+ }
151
+ advance(lexer);
152
+ }
153
+ return skipped;
154
+ }
155
+
156
+ static inline void consume_whitespaces_once(TSLexer* lexer)
157
+ {
158
+ while (iswspace(lexer->lookahead)) {
159
+ if (lexer->lookahead == '\n' || lexer->lookahead == '\r') {
160
+ advance(lexer);
161
+ break;
162
+ }
163
+ advance(lexer);
164
+ }
165
+ }
166
+
167
+ void* tree_sitter_matlab_external_scanner_create()
168
+ {
169
+ Scanner* scanner = calloc(1, sizeof(Scanner));
170
+ return scanner;
171
+ }
172
+
173
// Releases a state previously returned by the create function.
// Passing NULL is allowed: free() ignores null pointers.
void tree_sitter_matlab_external_scanner_destroy(void* payload)
{
    free(payload);
}
179
+
180
+ unsigned tree_sitter_matlab_external_scanner_serialize(void* payload, char* buffer)
181
+ {
182
+ Scanner* scanner = (Scanner*) payload;
183
+ buffer[0] = (char) scanner->is_inside_command;
184
+ buffer[1] = (char) scanner->line_continuation;
185
+ buffer[2] = (char) scanner->is_shell_scape;
186
+ buffer[3] = scanner->string_delimiter;
187
+ return 4;
188
+ }
189
+
190
+ void tree_sitter_matlab_external_scanner_deserialize(
191
+ void* payload,
192
+ const char* buffer,
193
+ unsigned length)
194
+ {
195
+ Scanner* scanner = (Scanner*) payload;
196
+ if (length == 4) {
197
+ scanner->is_inside_command = buffer[0];
198
+ scanner->line_continuation = buffer[1];
199
+ scanner->is_shell_scape = buffer[2];
200
+ scanner->string_delimiter = buffer[3];
201
+ }
202
+ }
203
+
204
+ static inline void consume_comment_line(TSLexer* lexer)
205
+ {
206
+ while (lexer->lookahead != '\n' && lexer->lookahead != '\r' && !lexer->eof(lexer)) {
207
+ advance(lexer);
208
+ }
209
+ }
210
+
211
// Scans a comment, a line continuation, or one of the tokens that can only
// be told apart from them after looking past a `.` (an entry delimiter in
// front of a number such as `.5`, or a `'` operator).
//
// entry_delimiter: ENTRY_DELIMITER is acceptable at this position.
// ctranspose:      CTRANSPOSE is acceptable at this position.
//
// Returns true with lexer->result_symbol set to COMMENT, LINE_CONTINUATION,
// ENTRY_DELIMITER or CTRANSPOSE; false when nothing matched.
// NOLINTNEXTLINE(*misc-no-recursion)
static bool scan_comment(TSLexer* lexer, bool entry_delimiter, bool ctranspose)
{
    // The token ends here unless a later mark_end moves the end forward.
    lexer->mark_end(lexer);

    const bool percent = lexer->lookahead == '%';
    // True only for three consecutive dots. One or two dots may already have
    // been consumed when this ends up false.
    const bool line_continuation = lexer->lookahead == '.' && consume_char('.', lexer)
        && consume_char('.', lexer) && consume_char('.', lexer);
    // `%{` opens a block comment. When this ends up false the `%` has
    // nevertheless been consumed.
    const bool block = percent && consume_char('%', lexer) && consume_char('{', lexer);

    // Since we cannot look multiple chars ahead in the main function, this
    // ended up being handled here. It allows the correct detection of numbers
    // like .5 inside matrices/cells: [0 .5].
    if (entry_delimiter && !percent && !line_continuation) {
        lexer->result_symbol = ENTRY_DELIMITER;
        return iswdigit(lexer->lookahead);
    }
    // We are inside a matrix/cell row and there is a line continuation, like this:
    // a = { 1 ...
    //       2 ...
    //     }

    if (entry_delimiter && line_continuation) {
        consume_whitespaces(lexer);
        if (lexer->lookahead == '.') {
            // A `.` follows the continuation: the token ends in front of it
            // and is a delimiter only when a digit comes next (`.5`).
            lexer->mark_end(lexer);
            advance(lexer);
            lexer->result_symbol = iswdigit(lexer->lookahead) ? ENTRY_DELIMITER : LINE_CONTINUATION;
        } else if (iswdigit(lexer->lookahead) || lexer->lookahead == '\'' || lexer->lookahead == '"') {
            // The end stays where it was marked on entry.
            lexer->result_symbol = ENTRY_DELIMITER;
        } else {
            lexer->result_symbol = LINE_CONTINUATION;
            lexer->mark_end(lexer);
        }
        return true;
    }

    if (block) {
        while (!lexer->eof(lexer) && iswspace_matlab(lexer->lookahead)) {
            advance(lexer);
        }

        // Something other than a line break follows `%{` on the same line:
        // this is an ordinary line comment.
        if (!consume_char('\n', lexer) && !consume_char('\r', lexer)) {
            consume_comment_line(lexer);
            lexer->result_symbol = COMMENT;
            lexer->mark_end(lexer);
            return true;
        }

        // Empty block comment
        if (lexer->lookahead == '%' && consume_char('%', lexer) && consume_char('}', lexer)) {
            lexer->result_symbol = COMMENT;
            lexer->mark_end(lexer);
            return true;
        }

        // Look for `%}` as the first non-blank text of a following line.
        while (!lexer->eof(lexer)) {
            consume_comment_line(lexer);
            advance(lexer);
            consume_whitespaces(lexer);

            if (consume_char('%', lexer) && consume_char('}', lexer)) {
                lexer->result_symbol = COMMENT;
                lexer->mark_end(lexer);
                return true;
            }
        }

        // End of input reached without finding the closing `%}`.
        return false;
    }

    if (percent || line_continuation) {
        consume_comment_line(lexer);
        lexer->mark_end(lexer);

        if (!line_continuation) {
            lexer->result_symbol = COMMENT;
            // Steps over the line break; it stays outside the token unless a
            // merged comment moves the end further on.
            advance(lexer);
        } else {
            lexer->result_symbol = LINE_CONTINUATION;
            consume_whitespaces_once(lexer);
            lexer->mark_end(lexer);
            return true;
        }

        // Merges consecutive comments into one token, unless they are
        // separated by a newline.
        while (!lexer->eof(lexer) && (lexer->lookahead == ' ' || lexer->lookahead == '\t')) {
            advance(lexer);
        }

        if (lexer->lookahead == '%') {
            return scan_comment(lexer, false, false);
        }

        return true;
    }

    if (ctranspose && lexer->lookahead == '\'') {
        advance(lexer);
        lexer->mark_end(lexer);
        lexer->result_symbol = CTRANSPOSE;
        return true;
    }

    return false;
}
318
+
319
// Decides whether the text at the lookahead starts a command
// (`name arg1 arg2`), a shell escape (`!cmd`), or a plain identifier.
//
// The token always ends right after the name; everything read beyond that
// point is lookahead only. scanner->is_inside_command is set when arguments
// are expected to follow.
//
// Returns true with lexer->result_symbol set to COMMAND_NAME, IDENTIFIER or
// CATCH_IDENTIFIER; false when the text is left to the internal lexer.
static bool scan_command(Scanner* scanner, TSLexer* lexer, const bool* valid_symbols)
{
    // Special case: shell escape
    if (lexer->lookahead == '!') {
        advance(lexer);
        while (iswspace_matlab(lexer->lookahead)) {
            advance(lexer);
        }
        // The command name runs up to the next space or line feed.
        while (lexer->lookahead != ' ' && lexer->lookahead != '\n' && !lexer->eof(lexer)) {
            advance(lexer);
        }
        lexer->result_symbol = COMMAND_NAME;
        lexer->mark_end(lexer);
        while (iswspace_matlab(lexer->lookahead)) {
            advance(lexer);
        }
        // Arguments follow unless the line ends here.
        scanner->is_inside_command = lexer->lookahead != '\n';
        scanner->is_shell_scape = scanner->is_inside_command;
        return true;
    }

    if (!is_identifier(lexer->lookahead, true)) {
        return false;
    }

    char buffer[256] = {0};
    consume_identifier(lexer, buffer);
    lexer->mark_end(lexer);
    const char* allowed_commands[] = {"methods", "arguments", "enumeration", "events"};
    if (buffer[0] != 0) {
        if (lexer->lookahead == '.') {
            // Since it is not followed by a space, it cannot be a command.
            if ((strcmp("get", buffer) == 0 || strcmp("set", buffer) == 0)) {
                return false;
            }
            // so it is ok to consume to identify a line continuation
            // NOLINTNEXTLINE(*misc-redundant-expression)
            if (consume_char('.', lexer) && consume_char('.', lexer) && consume_char('.', lexer)) {
                // If it is a keyword, yield to the internal scanner
                for (size_t i = 0; i < keywords_size; i++) {
                    if (strcmp(keywords[i], buffer) == 0) {
                        return false;
                    }
                }
            }
            lexer->result_symbol = IDENTIFIER;
            return true;
        }
        // The following keywords are allowed as commands if they get 1 argument
        for (unsigned i = 0; i < sizeof(allowed_commands) / sizeof(allowed_commands[0]); i++) {
            if (strcmp(allowed_commands[i], buffer) == 0) {
                goto check_command_for_argument;
            }
        }
        for (unsigned i = 0; i < keywords_size; i++) {
            if (strcmp(keywords[i], buffer) == 0) {
                return false;
            }
        }
    }
    goto skip_command_check;

check_command_for_argument:
    // If this is a keyword-command, check if it has an argument.
    // If it has no arguments, this is a keyword, not a command.
    lexer->result_symbol = COMMAND_NAME;
    while (!lexer->eof(lexer) && iswspace_matlab(lexer->lookahead)) {
        advance(lexer);
    }
    if (is_identifier(lexer->lookahead, true)) {
        scanner->is_inside_command = true;
        return true;
    }
    return false;

skip_command_check:

    // First case: found an end-of-line already, so this is a command for sure.
    // example:
    // pwd
    // pwd;
    // pwd,
    if (is_eol(lexer->lookahead)) {
        lexer->result_symbol = valid_symbols[CATCH_IDENTIFIER] ? CATCH_IDENTIFIER : COMMAND_NAME;
        return true;
    }

    // If it's not followed by a space, it may be something else, like A' for
    // example. Or A+2.
    if (lexer->lookahead != ' ') {
        lexer->result_symbol = IDENTIFIER;
        return true;
    }

    // If followed by a line continuation, look after it
    const int skipped = consume_whitespaces(lexer);
    if (skipped & 4) { // Command followed by spaces then newline
        scanner->is_inside_command = false;
        lexer->result_symbol = COMMAND_NAME;
        return true;
    }
    if (lexer->lookahead == '.' && consume_char('.', lexer) && consume_char('.', lexer)
        && consume_char('.', lexer)) {
        lexer->result_symbol = IDENTIFIER;
        return true;
    }

    // If it is followed by a space, it doesn't mean it's a command yet.
    // It could be A + 2 or A = 2. Let's check what is the first char after
    // all whitespaces. We mark it already as this is the right place, and we
    // only need to make sure this is a command and not something else from
    // this point on.
    lexer->result_symbol = COMMAND_NAME;
    while (!lexer->eof(lexer) && iswspace_matlab(lexer->lookahead)) {
        advance(lexer);
    }

    // Check for end-of-line again, since it may be that the user just put a
    // space at the end, like `pwd ;`
    if (is_eol(lexer->lookahead)) {
        scanner->is_inside_command = true;
        return true;
    }

    // The first char of the first argument cannot be /=()/
    if (lexer->lookahead == '=' || lexer->lookahead == '(' || lexer->lookahead == ')') {
        lexer->result_symbol = IDENTIFIER;
        return true;
    }

    // If it is a single quote, it is a command.
    if (lexer->lookahead == '\'') {
        scanner->is_inside_command = true;
        return true;
    }

    // If it is an identifier char, then it's a command
    if (is_identifier(lexer->lookahead, false)) {
        scanner->is_inside_command = true;
        return true;
    }

    // If it is a char greater than 0xC0, then assume it's a valid UTF-8
    // char, and that this is a command.
    if (lexer->lookahead >= 0xC0) {
        scanner->is_inside_command = true;
        return true;
    }

    // Let's now consider punctuation marks.
    if (is_punct_char(lexer->lookahead)) {
        // In this case, we advance and look at what comes next too.
        const uint32_t first = lexer->lookahead;
        advance(lexer);
        const uint32_t second = lexer->lookahead;

        // If it's the end-of-line, then it's a command.
        if (is_eol(second)) {
            scanner->is_inside_command = true;
            return true;
        }

        if (iswspace_matlab(second)) {
            // If it is a space, then it depends on what we have, since
            // `disp + ;` is a valid command but `disp + 2;` isn't.
            const char operators[] = {
                '!',
                '&',
                '*',
                '+',
                '-',
                '/',
                '<',
                '>',
                '@',
                '\\',
                '^',
                '|',
            };
            bool is_invalid = false;
            for (size_t i = 0; i < sizeof(operators); i++) {
                if (first == (uint32_t) operators[i]) {
                    is_invalid = true;
                    break;
                }
            }
            // If it is an operator, this can only be a command if there
            // are no further arguments.
            if (is_invalid) {
                advance(lexer);
                while (iswspace_matlab(lexer->lookahead)) {
                    advance(lexer);
                }
                scanner->is_inside_command = is_eol(lexer->lookahead);
                lexer->result_symbol = scanner->is_inside_command ? COMMAND_NAME : IDENTIFIER;
                return true;
            }

            // If it's not an operator, then this is a command.
            scanner->is_inside_command = true;
            return true;
        }

        // Now we check for the rest of the operators.
        // Since they have 2 digits, it matters if the next is a space.
        advance(lexer);

        if (lexer->lookahead != ' ') {
            scanner->is_inside_command = true;
            return true;
        }

        // Two-character operators: `name OP ` followed by a space is an
        // expression, not a command.
        const char operators[][2] = {
            {'&', '&'},
            {'|', '|'},
            {'=', '='},
            {'~', '='},
            {'<', '='},
            {'>', '='},
            {'.', '+'},
            {'.', '-'},
            {'.', '*'},
            {'.', '/'},
            {'.', '\\'},
            {'.', '^'},
        };

        for (int i = 0; i < 12; i++) {
            if ((uint32_t) operators[i][0] == first && (uint32_t) operators[i][1] == second) {
                lexer->result_symbol = IDENTIFIER;
                return true;
            }
        }

        scanner->is_inside_command = true;
        return true;
    }

    return false;
}
559
+
560
// Scans one argument of a command, or the comment / line continuation that
// interrupts the argument list.
//
// Clears scanner->is_inside_command (and the related flags) once the end of
// the command line is seen.
//
// Returns true with lexer->result_symbol set to COMMAND_ARGUMENT or
// LINE_CONTINUATION, or whatever scan_comment reports for a `%` comment;
// false at end of input.
static bool scan_command_argument(Scanner* scanner, TSLexer* lexer)
{
    // If this is a shell escape command, we just break arguments in spaces
    // since we don't know what shell it is.
    if (scanner->is_shell_scape) {
        if (lexer->eof(lexer)) {
            return false;
        }

        while (lexer->lookahead != ' ' && lexer->lookahead != '\n' && !lexer->eof(lexer)) {
            advance(lexer);
        }
        lexer->result_symbol = COMMAND_ARGUMENT;
        lexer->mark_end(lexer);
        // Peek past trailing blanks to find out whether this was the last
        // argument of the line.
        while (iswspace_matlab(lexer->lookahead)) {
            advance(lexer);
        }
        if (lexer->lookahead == '\n') {
            scanner->is_inside_command = false;
            scanner->is_shell_scape = false;
        }
        return true;
    }

    // Avoids infinite loop when the argument is right before the eof.
    if (lexer->eof(lexer)) {
        return false;
    }

    bool quote = false;    // inside a single-quoted part of the argument
    int32_t parens = 0;    // nesting depth of (), [] and {}
    bool consumed = false; // at least one character belongs to this argument

    while (!lexer->eof(lexer)) {
        // No matter what, found new line
        const bool cond1 = lexer->lookahead == '\n' || lexer->lookahead == '\r';
        // No quotes, no parens, found $._end_of_line or space
        const bool cond2 = !quote && parens == 0
            && (is_eol(lexer->lookahead) || iswspace_matlab(lexer->lookahead));
        // Inside parens, no quotes, found ;
        const bool cond3 = !quote && parens != 0 && lexer->lookahead == ';';
        if (cond1 || cond2 || cond3) {
            lexer->result_symbol = COMMAND_ARGUMENT;
            lexer->mark_end(lexer);

            while (iswspace_matlab(lexer->lookahead)) {
                advance(lexer);
            }

            // The command ends when the argument is followed by an end of
            // statement.
            if (is_eol(lexer->lookahead) || cond1) {
                scanner->line_continuation = false;
                scanner->is_inside_command = false;
            }

            return true;
        }

        // Line comment, finish.
        if ((!quote || (quote && parens != 0)) && lexer->lookahead == '%') {
            scanner->is_inside_command = false;
            if (consumed) {
                // Emit the argument first; the comment is scanned on the
                // next call.
                lexer->result_symbol = COMMAND_ARGUMENT;
                lexer->mark_end(lexer);
                return true;
            }
            return scan_comment(lexer, false, false);
        }

        // Line continuation
        if ((!quote || (quote && parens != 0)) && lexer->lookahead == '.') {
            // Tentatively end the argument in front of the dot.
            lexer->result_symbol = COMMAND_ARGUMENT;
            lexer->mark_end(lexer);
            advance(lexer);
            if (lexer->lookahead == '.') {
                advance(lexer);
                if (lexer->lookahead == '.') {
                    if (consumed) {
                        // Argument text precedes the `...`: emit it now and
                        // remember that a continuation comes next.
                        scanner->line_continuation = true;
                    } else {
                        consume_comment_line(lexer);
                        lexer->result_symbol = LINE_CONTINUATION;
                        lexer->mark_end(lexer);
                    }
                    return true;
                }
                // Only two dots: they are ordinary argument text.
                consumed = true;
                continue;
            }
            // A single dot: ordinary argument text.
            consumed = true;
            continue;
        }

        if ((lexer->lookahead == '(' || lexer->lookahead == '[' || lexer->lookahead == '{')
            && (!quote || (quote && parens != 0))) {
            parens++;
        }

        if ((lexer->lookahead == ')' || lexer->lookahead == ']' || lexer->lookahead == '}')
            && (!quote || (quote && parens != 0))) {
            parens--;
        }

        if (lexer->lookahead == '\'') {
            quote = !quote;
        }

        advance(lexer);
        consumed = true;
    }

    // Mark as argument so the scanner doesn't get called again in an infinite
    // loop.
    if (lexer->eof(lexer)) {
        lexer->result_symbol = COMMAND_ARGUMENT;
        lexer->mark_end(lexer);
        return true;
    }

    return false;
}
680
+
681
+ static bool scan_string_open(Scanner* scanner, TSLexer* lexer)
682
+ {
683
+ switch (lexer->lookahead) {
684
+ case '"':
685
+ scanner->string_delimiter = '"';
686
+ advance(lexer);
687
+ lexer->result_symbol = DOUBLE_QUOTE_STRING_START;
688
+ lexer->mark_end(lexer);
689
+ return true;
690
+ case '\'':
691
+ scanner->string_delimiter = '\'';
692
+ advance(lexer);
693
+ lexer->result_symbol = SINGLE_QUOTE_STRING_START;
694
+ lexer->mark_end(lexer);
695
+ // A single quote string has to be ended in the same line.
696
+ while (!lexer->eof(lexer) && lexer->lookahead != '\n') {
697
+ if (lexer->lookahead == '\'') {
698
+ return true;
699
+ }
700
+ advance(lexer);
701
+ }
702
+ return false;
703
+ default:
704
+ return false;
705
+ }
706
+ }
707
+
708
// Scans the next piece of a string whose opening delimiter has already been
// consumed (scanner->string_delimiter holds it).
//
// Returns true with lexer->result_symbol set to one of
//   DOUBLE_QUOTE_STRING_END / SINGLE_QUOTE_STRING_END - closing delimiter
//   FORMATTING_SEQUENCE - `%%` or a `%...` conversion
//   ESCAPE_SEQUENCE     - backslash escape
//   STRING_CONTENT      - plain text
// Returns false, after resetting scanner->string_delimiter, when the string
// turns out to be unterminated at this position.
static bool scan_string_close(Scanner* scanner, TSLexer* lexer)
{
    if (lexer->lookahead == scanner->string_delimiter) {
        advance(lexer);
        // A doubled delimiter is an escaped delimiter: it is content.
        if (lexer->lookahead == scanner->string_delimiter) {
            advance(lexer);
            lexer->result_symbol = STRING_CONTENT;
            goto content;
        }
        lexer->result_symbol = scanner->string_delimiter == '"' ? DOUBLE_QUOTE_STRING_END
                                                                : SINGLE_QUOTE_STRING_END;
        lexer->mark_end(lexer);
        scanner->string_delimiter = 0;
        return true;
    }

    // This means this string is not properly terminated.
    if (lexer->lookahead == '\n' || lexer->lookahead == '\r' || lexer->eof(lexer)) {
        scanner->string_delimiter = 0;
        return false;
    }

    if (lexer->lookahead == '%') {
        advance(lexer);

        // `%%` is a literal percent sign.
        if (lexer->lookahead == '%') {
            advance(lexer);
            lexer->result_symbol = FORMATTING_SEQUENCE;
            lexer->mark_end(lexer);
            return true;
        }

        // valid_tokens: characters accepted inside a conversion.
        // end_tokens: characters that finish a conversion.
        const char* valid_tokens = "1234567890.-+ #btcdeEfgGosuxX";
        const char* end_tokens = "cdeEfgGosuxX";
        while (!lexer->eof(lexer) && lexer->lookahead != '\n' && lexer->lookahead != '\r') {
            bool is_valid = false;
            for (size_t i = 0; i < strlen(valid_tokens); i++) {
                if ((int32_t) valid_tokens[i] == lexer->lookahead) {
                    is_valid = true;
                    break;
                }
            }

            // Not a conversion after all: treat what was read as content.
            if (!is_valid) {
                lexer->result_symbol = STRING_CONTENT;
                goto content;
            }

            // 12 is the number of characters in end_tokens.
            for (int i = 0; i < 12; i++) {
                if (end_tokens[i] == lexer->lookahead) {
                    advance(lexer);
                    lexer->result_symbol = FORMATTING_SEQUENCE;
                    lexer->mark_end(lexer);
                    return true;
                }
            }

            advance(lexer);
        }

        // The line ended in the middle of a conversion.
        scanner->string_delimiter = 0;
        return false;
    }

    if (lexer->lookahead == '\\') {
        advance(lexer);

        // \x followed by any number of hexadecimal digits.
        if (lexer->lookahead == 'x') {
            advance(lexer);
            while (!lexer->eof(lexer)) {
                const char* hexa_chars = "1234567890abcdefABCDEF";
                bool is_valid = false;
                // 22 is the number of characters in hexa_chars.
                for (int i = 0; i < 22; i++) {
                    if (hexa_chars[i] == lexer->lookahead) {
                        is_valid = true;
                        break;
                    }
                }

                if (!is_valid) {
                    lexer->result_symbol = ESCAPE_SEQUENCE;
                    lexer->mark_end(lexer);
                    return true;
                }

                advance(lexer);
            }
        }

        // Backslash followed by octal digits.
        if (lexer->lookahead >= '0' && lexer->lookahead <= '7') {
            while (lexer->lookahead >= '0' && lexer->lookahead <= '7' && !lexer->eof(lexer)) {
                advance(lexer);
            }

            lexer->result_symbol = ESCAPE_SEQUENCE;
            lexer->mark_end(lexer);
            return true;
        }

        // Single-character escapes; 8 is the number of characters in escapes.
        const char* escapes = "abfnrtv\\";
        bool is_valid = false;
        for (int i = 0; i < 8; i++) {
            if (escapes[i] == lexer->lookahead) {
                is_valid = true;
                break;
            }
        }

        if (is_valid) {
            advance(lexer);
            lexer->result_symbol = ESCAPE_SEQUENCE;
            lexer->mark_end(lexer);
            return true;
        }
        // Unknown escape: the backslash is treated as content below.
    }

content:
    while (lexer->lookahead != '\n' && lexer->lookahead != '\r' && !lexer->eof(lexer)) {
        // In MATLAB '' and "" are valid inside their own kind: 'It''s ok' "He said ""it's ok"""
        if (lexer->lookahead == scanner->string_delimiter) {
            lexer->result_symbol = STRING_CONTENT;
            lexer->mark_end(lexer);
            advance(lexer);
            // A lone delimiter closes the string: the content ends in front
            // of it and the delimiter is scanned on the next call.
            if (lexer->lookahead != scanner->string_delimiter) {
                return true;
            }
            advance(lexer);
            continue;
        }

        // The scanner will be called again, and this time we will match in the if
        // before this while.
        if (lexer->lookahead == '%' || lexer->lookahead == '\\') {
            lexer->result_symbol = STRING_CONTENT;
            lexer->mark_end(lexer);
            advance(lexer);
            // Followed by the delimiter or a blank: keep it as content.
            if (lexer->lookahead == scanner->string_delimiter || iswspace_matlab(lexer->lookahead)) {
                goto content;
            }
            return true;
        }

        advance(lexer);
    }

    // Mark end of content here and end of string on next call. This is an
    // unterminated string and it's better to wrongly finish it here, otherwise
    // the error will appear god knows how many lines after this and it will be
    // hard for the user to understand what went wrong.
    if (lexer->lookahead == '\n' || lexer->lookahead == '\r' || lexer->eof(lexer)) {
        lexer->result_symbol = STRING_CONTENT;
        lexer->mark_end(lexer);
        return true;
    }

    scanner->string_delimiter = 0;
    return false;
}
866
+
867
// Decides whether the `[` at the lookahead opens the target list of a
// multi-output assignment, i.e. whether its matching `]` is followed by a
// single `=`.
//
// The token is only the `[`; everything after it is read as lookahead.
// Returns true with lexer->result_symbol set to MULTIOUTPUT_VARIABLE_START;
// false when no matching `]` is found or it is not followed by `=` (or is
// followed by `==`).
static inline bool scan_multioutput_var_start(TSLexer* lexer)
{
    advance(lexer);
    lexer->result_symbol = MULTIOUTPUT_VARIABLE_START;
    lexer->mark_end(lexer);

    // We can have arrays inside function calls inside the multi-output variable, so we have to keep
    // track.
    unsigned sb_count = 0;

    // Find the `]` that matches the opening bracket.
    while (!lexer->eof(lexer)) {
        // A line continuation: drop the rest of the line and its line break.
        // NOLINTNEXTLINE(*misc-redundant-expression)
        if (consume_char('.', lexer) && consume_char('.', lexer) && consume_char('.', lexer)) {
            consume_comment_line(lexer);
            advance(lexer);
        }

        if (lexer->lookahead == '[') {
            sb_count++;
            advance(lexer);
        }

        if (lexer->lookahead != ']') {
            advance(lexer);
        } else if (sb_count > 0) {
            sb_count--;
            advance(lexer);
        } else {
            break;
        }
    }

    if (lexer->lookahead != ']') {
        return false;
    }

    advance(lexer);

    // Skip blanks and line continuations between `]` and what follows.
    while (!lexer->eof(lexer)) {
        // NOLINTNEXTLINE(*misc-redundant-expression)
        if (consume_char('.', lexer) && consume_char('.', lexer) && consume_char('.', lexer)) {
            consume_comment_line(lexer);
            advance(lexer);
        } else if (iswspace_matlab(lexer->lookahead)) {
            advance(lexer);
        } else {
            break;
        }
    }

    // `=` is an assignment, `==` is a comparison.
    if (lexer->lookahead == '=') {
        advance(lexer);
        if (lexer->lookahead != '=') {
            return true;
        }
    }

    return false;
}
926
+
927
static bool scan_identifier(TSLexer* lexer);

// Decides whether an entry delimiter (the separator between two entries of a
// matrix/cell row) is present at the current position.
//
// skipped: bit set returned by skip_whitespaces for the whitespace in front
//          of the lookahead.
//
// Returns true with lexer->result_symbol set to ENTRY_DELIMITER, or to
// IDENTIFIER when the decision is delegated to scan_identifier. Except for a
// comma, the delimiter token is empty: it ends where this function started.
static bool scan_entry_delimiter(TSLexer* lexer, int skipped)
{
    lexer->mark_end(lexer);
    lexer->result_symbol = ENTRY_DELIMITER;

    // A line break was skipped: no delimiter is emitted here.
    if (skipped & 2) {
        return false;
    }

    // An explicit comma is the delimiter itself.
    if (lexer->lookahead == ',') {
        advance(lexer);
        lexer->mark_end(lexer);
        lexer->result_symbol = ENTRY_DELIMITER;
        return true;
    }

    // NOTE(review): this tests the character two positions after the dot,
    // not the one right after it - confirm that this is intended.
    if (lexer->lookahead == '.') {
        advance(lexer);
        advance(lexer);
        return iswdigit(lexer->lookahead);
    }

    // These start a new entry only when separated by whitespace.
    if (lexer->lookahead == '{' || lexer->lookahead == '(' || lexer->lookahead == '\'') {
        return skipped != 0;
    }

    if (lexer->lookahead == '[') {
        return true;
    }

    // These chars mean we cannot end the cell here, as the expression will
    // surely continue OR we need to just leave the char there and the internal
    // parser will do the rest.
    const char no_end[] = {']', '}', '&', '|', '=', '<', '>', '*', '/', '\\', '^', ';', ':'};
    for (size_t i = 0; i < sizeof(no_end); i++) {
        if ((int32_t) no_end[i] == lexer->lookahead) {
            return false;
        }
    }

    // `~=` continues the expression; any other `~` starts a new entry.
    if (lexer->lookahead == '~') {
        advance(lexer);
        return lexer->lookahead != '=';
    }

    // A sign followed by a space continues the expression (`1 + 2`). A sign
    // attached to what follows starts a new entry, but only when whitespace
    // precedes it (`1 +2`).
    const char maybe_end[] = {'+', '-'};
    for (size_t i = 0; i < sizeof(maybe_end); i++) {
        if ((int32_t) maybe_end[i] == lexer->lookahead) {
            advance(lexer);
            if (lexer->lookahead == ' ') {
                return false;
            }
            return skipped != 0;
        }
    }

    if (skipped != 0) {
        return true;
    }

    if (is_identifier(lexer->lookahead, true)) {
        return scan_identifier(lexer);
    }

    return false;
}
994
+
995
+ static bool scan_identifier(TSLexer* lexer)
996
+ {
997
+ char buffer[256] = {0};
998
+ consume_identifier(lexer, buffer);
999
+ if (buffer[0] != 0) {
1000
+ if (lexer->lookahead == '.') {
1001
+ if ((strcmp("get", buffer) == 0 || strcmp("set", buffer) == 0)) {
1002
+ return false;
1003
+ }
1004
+ lexer->result_symbol = IDENTIFIER;
1005
+ lexer->mark_end(lexer);
1006
+ return true;
1007
+ }
1008
+ for (size_t i = 0; i < keywords_size; i++) {
1009
+ if (strcmp(keywords[i], buffer) == 0) {
1010
+ return false;
1011
+ }
1012
+ }
1013
+ lexer->result_symbol = IDENTIFIER;
1014
+ lexer->mark_end(lexer);
1015
+ return true;
1016
+ }
1017
+ return false;
1018
+ }
1019
+
1020
+ static bool scan_transpose(TSLexer* lexer)
1021
+ {
1022
+ if (lexer->lookahead == '\'') {
1023
+ advance(lexer);
1024
+ lexer->mark_end(lexer);
1025
+ lexer->result_symbol = TRANSPOSE;
1026
+ return true;
1027
+ }
1028
+ if (lexer->lookahead == '.' && consume_char('\'', lexer)) {
1029
+ advance(lexer);
1030
+ lexer->mark_end(lexer);
1031
+ lexer->result_symbol = CTRANSPOSE;
1032
+ return true;
1033
+ }
1034
+ return false;
1035
+ }
1036
+
1037
// Entry point of the external scanner: dispatches to the specialised
// scanners according to the scanner state and to the tokens that are
// acceptable at this position.
//
// payload:       the Scanner state.
// valid_symbols: indexed by TokenType; true for every acceptable token.
//
// Returns true when a token was recognised (lexer->result_symbol holds its
// kind), false otherwise. The order of the checks below is significant: the
// first scanner that applies decides the result.
bool tree_sitter_matlab_external_scanner_scan(void* payload, TSLexer* lexer, const bool* valid_symbols)
{
    Scanner* scanner = (Scanner*) payload;
    if (scanner->string_delimiter == 0) {
        // Not inside a string: leading whitespace never belongs to a token.
        int skipped = skip_whitespaces(lexer);

        // Comments and line continuations. A leading `.` is only looked at
        // when no line break was skipped in front of it.
        if ((scanner->line_continuation || !scanner->is_inside_command) && valid_symbols[COMMENT]
            && (lexer->lookahead == '%' || ((skipped & 2) == 0 && lexer->lookahead == '.'))) {
            return scan_comment(lexer, valid_symbols[ENTRY_DELIMITER], valid_symbols[CTRANSPOSE]);
        }

        if (!scanner->is_inside_command) {
            // The operator must be attached to its operand (nothing skipped).
            if (skipped == 0 && valid_symbols[TRANSPOSE]) {
                if (scan_transpose(lexer)) {
                    return true;
                }
            }

            if ((valid_symbols[SINGLE_QUOTE_STRING_START] && lexer->lookahead == '\'')
                || (valid_symbols[DOUBLE_QUOTE_STRING_START] && lexer->lookahead == '"')) {
                return scan_string_open(scanner, lexer);
            }

            if (!scanner->line_continuation) {
                if (valid_symbols[MULTIOUTPUT_VARIABLE_START] && lexer->lookahead == '[') {
                    return scan_multioutput_var_start(lexer);
                }

                if (valid_symbols[ENTRY_DELIMITER]) {
                    return scan_entry_delimiter(lexer, skipped);
                }
            }

            if (valid_symbols[COMMAND_NAME]) {
                // Start from a clean command state; scan_command sets the
                // flags again when it finds a command.
                scanner->is_inside_command = false;
                scanner->is_shell_scape = false;
                return scan_command(scanner, lexer, valid_symbols);
            }

            // Only when no line break was skipped in front of the name.
            if (valid_symbols[IDENTIFIER] && (skipped & 2) == 0) {
                scanner->is_inside_command = false;
                scanner->is_shell_scape = false;
                return scan_identifier(lexer);
            }
        } else {
            if (valid_symbols[COMMAND_ARGUMENT]) {
                return scan_command_argument(scanner, lexer);
            }
        }
    } else {
        // Inside a string: whitespace is significant and is not skipped.
        if (valid_symbols[DOUBLE_QUOTE_STRING_END] || valid_symbols[SINGLE_QUOTE_STRING_END]
            || valid_symbols[FORMATTING_SEQUENCE]) {
            return scan_string_close(scanner, lexer);
        }
    }

    return false;
}