omextra 0.0.0.dev467__py3-none-any.whl → 0.0.0.dev487__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1888 @@
1
+ # ruff: noqa: UP006 UP007 UP043 UP045
2
+ # @omlish-lite
3
+ import dataclasses as dc
4
+ import enum
5
+ import typing as ta
6
+
7
+ from omlish.lite.check import check
8
+
9
+ from .errors import EofYamlError
10
+ from .errors import YamlError
11
+ from .errors import YamlErrorOr
12
+ from .errors import yaml_error
13
+ from .tokens import YAML_RESERVED_TAG_KEYWORD_MAP
14
+ from .tokens import YamlIndicator
15
+ from .tokens import YamlPosition
16
+ from .tokens import YamlToken
17
+ from .tokens import YamlTokenMakers
18
+ from .tokens import YamlTokens
19
+ from .tokens import YamlTokenType
20
+ from .tokens import new_yaml_token
21
+
22
+
23
+ ##
24
+
25
+
26
@dc.dataclass()
class InvalidTokenYamlError(YamlError):
    """Error raised by the scanner that carries the invalid token it produced."""

    token: YamlToken

    @property
    def message(self) -> str:
        """Return the message of the error attached to the wrapped token (which must not be None)."""
        token_error = check.not_none(self.token.error)
        return token_error.message
33
+
34
+
35
def err_invalid_token(tk: YamlToken) -> InvalidTokenYamlError:
    """Wrap the invalid token `tk` in an `InvalidTokenYamlError`."""
    return InvalidTokenYamlError(token=tk)
39
+
40
+
41
+ ##
42
+
43
+
44
# Context at scanning
@dc.dataclass()
class YamlScanningContext:
    """
    Mutable cursor and buffer state used while scanning one YAML source string.

    `buf` accumulates the value text of the token currently being built, `obuf` accumulates the original text
    (whitespace included), and `tokens` collects the tokens emitted so far. `mstate` is non-None while a literal,
    folded or raw-folded multi-line scalar is being scanned.
    """

    idx: int = 0  # index of the current character in `src`
    size: int = 0  # len(src), cached by reset()
    not_space_char_pos: int = 0  # len(buf) right after the last character that counted as non-blank
    not_space_org_char_pos: int = 0  # len(obuf) right after its last non-space / non-tab character
    src: str = ''
    buf: str = ''
    obuf: str = ''
    tokens: YamlTokens = dc.field(default_factory=YamlTokens)
    mstate: ta.Optional['YamlMultiLineState'] = None

    def clear(self) -> None:
        """Drop both buffers and leave multi-line mode."""
        self.reset_buffer()
        self.mstate = None

    def reset(self, src: str) -> None:
        """Re-initialize the context to scan `src` from its first character."""
        self.idx = 0
        self.size = len(src)
        self.src = src
        self.tokens = YamlTokens()
        self.reset_buffer()
        self.mstate = None

    def reset_buffer(self) -> None:
        """Empty `buf` and `obuf` and reset their non-blank markers."""
        self.buf = ''
        self.obuf = ''
        self.not_space_char_pos = 0
        self.not_space_org_char_pos = 0

    def break_multi_line(self) -> None:
        """Leave multi-line mode without touching the buffers."""
        self.mstate = None

    def get_multi_line_state(self) -> ta.Optional['YamlMultiLineState']:
        """Return the active multi-line state, or None when not in multi-line mode."""
        return self.mstate

    def set_literal(self, last_delim_column: int, opt: str) -> None:
        """Enter literal multi-line mode with header options `opt`."""
        mstate = YamlMultiLineState(
            is_literal=True,
            opt=opt,
        )
        # An explicit indentation indicator in the header fixes the first-line indent column up front.
        indent = first_line_indent_column_by_opt(opt)
        if indent > 0:
            mstate.first_line_indent_column = last_delim_column + indent
        self.mstate = mstate

    def set_folded(self, last_delim_column: int, opt: str) -> None:
        """Enter folded multi-line mode with header options `opt`."""
        mstate = YamlMultiLineState(
            is_folded=True,
            opt=opt,
        )
        # An explicit indentation indicator in the header fixes the first-line indent column up front.
        indent = first_line_indent_column_by_opt(opt)
        if indent > 0:
            mstate.first_line_indent_column = last_delim_column + indent
        self.mstate = mstate

    def set_raw_folded(self, column: int) -> None:
        """Enter raw-folded multi-line mode whose content starts at `column`."""
        mstate = YamlMultiLineState(
            is_raw_folded=True,
        )
        mstate.update_indent_column(column)
        self.mstate = mstate

    def add_token(self, tk: ta.Optional[YamlToken]) -> None:
        """Append `tk` to the token list; None is ignored."""
        if tk is None:
            return
        self.tokens.append(tk)  # FIXME: .add??

    def add_buf(self, r: str) -> None:
        """Append `r` to the value buffer, skipping leading spaces and tabs."""
        if len(self.buf) == 0 and (r == ' ' or r == '\t'):
            return
        self.buf += r
        if r != ' ' and r != '\t':
            self.not_space_char_pos = len(self.buf)

    def add_buf_with_tab(self, r: str) -> None:
        """Append `r` to the value buffer, skipping leading spaces only (tabs count as content)."""
        if len(self.buf) == 0 and r == ' ':
            return
        self.buf += r
        if r != ' ':
            self.not_space_char_pos = len(self.buf)

    def add_origin_buf(self, r: str) -> None:
        """Append `r` to the original-text buffer unconditionally."""
        self.obuf += r
        if r != ' ' and r != '\t':
            self.not_space_org_char_pos = len(self.obuf)

    def remove_right_space_from_buf(self) -> None:
        """Trim trailing spaces/tabs from `obuf`; when anything was trimmed, re-derive `buf` from buffered_src()."""
        trimmed_buf = self.obuf[:self.not_space_org_char_pos]
        buflen = len(trimmed_buf)
        diff = len(self.obuf) - buflen
        if diff > 0:
            self.obuf = self.obuf[:buflen]
            self.buf = self.buffered_src()

    def is_eos(self) -> bool:
        """Return True when the cursor is at or past the last character of the source."""
        return len(self.src) - 1 <= self.idx

    def is_next_eos(self) -> bool:
        """Return True when there is no character after the current one."""
        return len(self.src) <= self.idx + 1

    def next(self) -> bool:
        """Return True while the cursor is still inside the source."""
        return self.idx < self.size

    def source(self, s: int, e: int) -> str:
        """Return the source slice `src[s:e]`."""
        return self.src[s:e]

    def previous_char(self) -> str:
        """Return the character before the cursor, or '' at the start of the source."""
        if self.idx > 0:
            return self.src[self.idx - 1]
        return ''

    def current_char(self) -> str:
        """Return the character at the cursor, or '' past the end of the source."""
        if self.size > self.idx:
            return self.src[self.idx]
        return ''

    def next_char(self) -> str:
        """Return the character after the cursor, or '' when there is none."""
        if self.size > self.idx + 1:
            return self.src[self.idx + 1]
        return ''

    def repeat_num(self, r: str) -> int:
        """Count how many consecutive occurrences of `r` start at the cursor."""
        cnt = 0
        for i in range(self.idx, self.size):
            if self.src[i] == r:
                cnt += 1
            else:
                break
        return cnt

    def progress(self, num: int) -> None:
        """Advance the cursor by `num` characters."""
        self.idx += num

    def exists_buffer(self) -> bool:
        """Return True when buffered_src() is non-empty."""
        return len(self.buffered_src()) != 0

    def is_multi_line(self) -> bool:
        """Return True while a multi-line scalar is being scanned."""
        return self.mstate is not None

    def buffered_src(self) -> str:
        """
        Return the buffered value text with trailing blanks removed.

        In multi-line mode the trailing newlines are additionally chomped according to the block header: '-' (and
        raw-folded) strips all of them, '+' keeps all of them, and the default keeps exactly one.
        """
        src = self.buf[:self.not_space_char_pos]

        if self.is_multi_line():
            mstate = check.not_none(self.get_multi_line_state())

            # remove end '\n' character and trailing empty lines.
            # https://yaml.org/spec/1.2.2/#8112-block-chomping-indicator
            if mstate.has_trim_all_end_newline_opt():
                # If the '-' flag is specified, all trailing newline characters will be removed.
                src = src.rstrip('\n')

            elif not mstate.has_keep_all_end_newline_opt():
                # Normally, all but one of the trailing newline characters are removed. Slice off exactly the surplus:
                # str.rstrip('\n') would strip every trailing newline at once, including the one that must stay.
                trailing_new_line_count = len(src) - len(src.rstrip('\n'))
                if trailing_new_line_count > 1:
                    src = src[:len(src) - (trailing_new_line_count - 1)]

            # If the text ends with a space character, remove all of them.
            if mstate.has_trim_all_end_newline_opt():
                src = src.rstrip(' ')

            if src == '\n':
                # If the content consists only of a newline, it can be considered as the document ending without any
                # specified value, so it is treated as an empty string.
                src = ''

            if mstate.has_keep_all_end_newline_opt() and len(src) == 0:
                src = '\n'

        return src

    def buffered_token(self, pos: YamlPosition) -> ta.Optional[YamlToken]:
        """
        Turn the buffered text into a token positioned at `pos` and reset the buffers.

        Returns None (without emitting anything) at the very start of the source or when nothing is buffered.
        """
        if self.idx == 0:
            return None

        source = self.buffered_src()
        if len(source) == 0:
            self.buf = self.buf[:0]  # clear value's buffer only.
            return None

        tk: ta.Optional[YamlToken]
        if self.is_multi_line():
            tk = YamlTokenMakers.new_string(source, self.obuf, pos)
        else:
            tk = new_yaml_token(source, self.obuf, pos)

        self.set_token_type_by_prev_tag(tk)
        self.reset_buffer()
        return tk

    def set_token_type_by_prev_tag(self, tk: ta.Optional[YamlToken]) -> None:
        """Force `tk` to STRING type when the previous token is a tag that is not a reserved keyword tag."""
        last_tk = self.last_token()
        if last_tk is None:
            return

        if last_tk.type != YamlTokenType.TAG:
            return

        tag = last_tk.value
        if tag not in YAML_RESERVED_TAG_KEYWORD_MAP:
            check.not_none(tk).type = YamlTokenType.STRING

    def last_token(self) -> ta.Optional[YamlToken]:
        """Return the most recently added token, or None when no token has been added yet."""
        if len(self.tokens) != 0:
            return self.tokens[len(self.tokens) - 1]

        return None

    @staticmethod
    def new(src: str) -> 'YamlScanningContext':
        """Create a context that is ready to scan `src`."""
        ctx = YamlScanningContext()
        ctx.reset(src)
        return ctx
267
+
268
+
269
+ ##
270
+
271
+
272
@dc.dataclass()
class YamlMultiLineState:
    """Indentation and folding bookkeeping for one literal, folded or raw-folded multi-line scalar."""

    opt: str = ''
    first_line_indent_column: int = 0
    prev_line_indent_column: int = 0
    line_indent_column: int = 0
    last_not_space_only_line_indent_column: int = 0
    space_only_indent_column: int = 0
    folded_new_line: bool = False
    is_raw_folded: bool = False
    is_literal: bool = False
    is_folded: bool = False

    def last_delim_column(self) -> int:
        """Return the column just before the first content column, or 0 while that column is unknown."""
        first = self.first_line_indent_column
        return first - 1 if first != 0 else 0

    def update_indent_column(self, column: int) -> None:
        """Record `column` as the first-line and/or current-line indent column where those are still unset (0)."""
        if self.first_line_indent_column == 0:
            self.first_line_indent_column = column
        if self.line_indent_column == 0:
            self.line_indent_column = column

    def update_space_only_indent_column(self, column: int) -> None:
        """Remember the indent of a space-only line; ignored once the first content line has been seen."""
        if self.first_line_indent_column == 0:
            self.space_only_indent_column = column

    def validate_indent_after_space_only(self, column: int) -> ta.Optional[YamlError]:
        """Return an error when the first content line is indented less than a preceding space-only line."""
        first_line_pending = self.first_line_indent_column == 0
        if first_line_pending and self.space_only_indent_column > column:
            return yaml_error('invalid number of indent is specified after space only')
        return None

    def validate_indent_column(self) -> ta.Optional[YamlError]:
        """Return an error when the header gives an explicit indent and the current line is indented less."""
        header_has_indent = first_line_indent_column_by_opt(self.opt) != 0
        if header_has_indent and self.first_line_indent_column > self.line_indent_column:
            return yaml_error('invalid number of indent is specified in the multi-line header')
        return None

    def update_new_line_state(self) -> None:
        """Roll the per-line indent state over at a line break and flag that a newline is pending folding."""
        finished_line_indent = self.line_indent_column
        self.prev_line_indent_column = finished_line_indent
        if finished_line_indent != 0:
            self.last_not_space_only_line_indent_column = finished_line_indent
        self.line_indent_column = 0
        self.folded_new_line = True

    def is_indent_column(self, column: int) -> bool:
        """Return True when `column` lies before the first content column (column 1 while that is unknown)."""
        first = self.first_line_indent_column
        if first != 0:
            return first > column
        return column == 1

    def add_indent(self, ctx: YamlScanningContext, column: int) -> None:
        """Append one space of extra indentation to `ctx.buf` for columns at or past the first content column."""
        first = self.first_line_indent_column

        # Nothing to add before the first line has been evaluated, nor for columns left of the threshold that the
        # (positive) `first_line_indent_column` represents.
        if first == 0 or column < first:
            return

        # `folded_new_line` is set to True for every newline; outside literal mode it is cleared here.
        if not self.is_literal:
            self.folded_new_line = False

        # add_buf ignores leading space characters, so write to the buffer directly.
        ctx.buf = ctx.buf + ' '
        ctx.not_space_char_pos = len(ctx.buf)

    # update_new_line_in_folded if Folded or RawFolded context and the content on the current line starts at the same
    # column as the previous line, treat the new-line-char as a space.
    def update_new_line_in_folded(self, ctx: YamlScanningContext, column: int) -> None:
        """Fold the pending newline at the end of `ctx.buf` (no-op in literal mode or when none is pending)."""
        if self.is_literal or not self.folded_new_line:
            return

        # Folded or RawFolded from here on.
        buf = ctx.buf
        last_char = buf[-1:]  # '' when the buffer is empty
        prev_last_char = buf[-2:-1]  # '' when the buffer has fewer than two characters

        if self.line_indent_column == self.prev_line_indent_column:
            # ---
            # >
            #  a
            #  b
            if last_char == '\n':
                ctx.buf = buf[:-1] + ' '

        elif self.prev_line_indent_column == 0 and self.last_not_space_only_line_indent_column == column:
            # if previous line is indent-space and new-line-char only, prev_line_indent_column is zero. In this case,
            # last new-line-char is removed.
            # ---
            # >
            #  a
            #
            #  b
            if last_char == '\n' and prev_last_char == '\n':
                ctx.buf = buf[:-1]
                ctx.not_space_char_pos = len(ctx.buf)

        self.folded_new_line = False

    def has_trim_all_end_newline_opt(self) -> bool:
        """Return True when the header starts or ends with '-', or the scalar is raw-folded."""
        opt = self.opt
        return opt[:1] == '-' or opt[-1:] == '-' or self.is_raw_folded

    def has_keep_all_end_newline_opt(self) -> bool:
        """Return True when the header starts or ends with '+'."""
        opt = self.opt
        return opt[:1] == '+' or opt[-1:] == '+'
389
+
390
+
391
+ ##
392
+
393
+
394
def first_line_indent_column_by_opt(opt: str) -> int:
    """
    Extract the numeric indentation indicator from a block scalar header option string.

    Leading '-' then '+' characters and trailing '-' then '+' characters are stripped (in that order) and the rest
    is parsed as a base-10 integer. Returns 0 when what remains is not an integer.
    """
    digits = opt.lstrip('-').lstrip('+').rstrip('-').rstrip('+')
    try:
        indent = int(digits, 10)
    except ValueError:
        indent = 0
    return indent
403
+
404
+
405
+ ##
406
+
407
+
408
class YamlIndentState(enum.Enum):
    """How the indentation of the current line relates to the previous one."""

    # EQUAL equals previous indent
    EQUAL = 1
    # UP more indent than previous
    UP = 2
    # DOWN less indent than previous
    DOWN = 3
    # KEEP uses not indent token
    KEEP = 4
417
+
418
+
419
+ # Scanner holds the scanner's internal state while processing a given text. It can be allocated as part of another data
420
+ # structure but must be initialized via init before use.
421
+ @dc.dataclass()
422
+ class YamlScanner:
423
+ source: str = ''
424
+ source_pos: int = 0
425
+ source_size: int = 0
426
+ # line number. This number starts from 1.
427
+ line: int = 0
428
+ # column number. This number starts from 1.
429
+ column: int = 0
430
+ # offset represents the offset from the beginning of the source.
431
+ offset: int = 0
432
+ # last_delim_column is the last column needed to compare indent is retained.
433
+ last_delim_column: int = 0
434
+ # indent_num indicates the number of spaces used for indentation.
435
+ indent_num: int = 0
436
+ # prev_line_indent_num indicates the number of spaces used for indentation at previous line.
437
+ prev_line_indent_num: int = 0
438
+ # indent_level indicates the level of indent depth. This value does not match the column value.
439
+ indent_level: int = 0
440
+ is_first_char_at_line: bool = False
441
+ is_anchor: bool = False
442
+ is_alias: bool = False
443
+ is_directive: bool = False
444
+ started_flow_sequence_num: int = 0
445
+ started_flow_map_num: int = 0
446
+ indent_state: YamlIndentState = YamlIndentState.EQUAL
447
+ saved_pos: ta.Optional[YamlPosition] = None
448
+
449
def pos(self) -> YamlPosition:
    """Snapshot the scanner's current line, column, offset and indentation as a `YamlPosition`."""
    snapshot = YamlPosition(
        line=self.line,
        column=self.column,
        offset=self.offset,
        indent_num=self.indent_num,
        indent_level=self.indent_level,
    )
    return snapshot
457
+
458
def buffered_token(self, ctx: YamlScanningContext) -> ta.Optional[YamlToken]:
    """
    Build a token from the text buffered in `ctx`, positioned where that text started.

    A previously saved position takes precedence and is consumed. Otherwise the position is derived from the
    scanner's current position and the buffer contents.
    """
    saved = self.saved_pos
    if saved is not None:
        tk = ctx.buffered_token(saved)
        self.saved_pos = None
        return tk

    buffered_len = len(ctx.buf)
    if ctx.is_multi_line():
        line = self.line - self.new_line_count(ctx.buf)
        column = ctx.obuf.find(ctx.buf) + 1
        # Since we are in a literal, folded or raw folded we can use the indent level from the last token.
        last = ctx.last_token()
        if last is None:  # The last token should never be None here.
            level = self.indent_level
        else:
            level = last.position.indent_level + 1
    else:
        line = self.line
        column = self.column - buffered_len
        level = self.indent_level

    start = YamlPosition(
        line=line,
        column=column,
        offset=self.offset - buffered_len,
        indent_num=self.indent_num,
        indent_level=level,
    )
    return ctx.buffered_token(start)
482
+
483
def progress_column(self, ctx: YamlScanningContext, num: int) -> None:
    """Advance `num` characters on the current line: column, offset and the read cursors all move."""
    self.offset += num
    self.column += num
    self.progress(ctx, num)
487
+
488
def progress_only(self, ctx: YamlScanningContext, num: int) -> None:
    """Advance the offset and the read cursors by `num` characters, leaving the column untouched."""
    self.offset = self.offset + num
    self.progress(ctx, num)
491
+
492
def progress_line(self, ctx: YamlScanningContext) -> None:
    """Move past one line-break character: start a new line at column 1 and reset the per-line state."""
    # Remember the finished line's indentation before clearing it for the new line.
    self.prev_line_indent_num = self.indent_num
    self.indent_num = 0

    self.line += 1
    self.column = 1
    self.offset += 1

    self.is_first_char_at_line = True
    self.is_anchor = self.is_alias = self.is_directive = False

    self.progress(ctx, 1)
503
+
504
def progress(self, ctx: YamlScanningContext, num: int) -> None:
    """Advance both the context cursor and the scanner's own source position by `num` characters."""
    self.source_pos += num
    ctx.progress(num)
507
+
508
def is_new_line_char(self, c: str) -> bool:
    """Return True when `c` is exactly a line feed or a carriage return."""
    return c in ('\n', '\r')
514
+
515
def new_line_count(self, src: str) -> int:
    """Count the line breaks in `src`, treating a CRLF pair as a single break."""
    line_feeds = src.count('\n')
    carriage_returns = src.count('\r')
    # Every CRLF pair was counted once as '\r' and once as '\n'; count it only once.
    crlf_pairs = src.count('\r\n')
    return line_feeds + carriage_returns - crlf_pairs
531
+
532
def update_indent_level(self) -> None:
    """Step the indent level up or down by one according to how this line's indent compares to the previous."""
    previous, current = self.prev_line_indent_num, self.indent_num
    if previous < current:
        self.indent_level += 1
    elif previous > current and self.indent_level > 0:
        # Never let the level drop below zero.
        self.indent_level -= 1
538
+
539
def update_indent_state(self, ctx: YamlScanningContext) -> None:
    """Set the indent state to UP or DOWN by comparing the current column with the last delimiter column."""
    delim_column = self.last_delim_column
    if delim_column == 0:
        return

    # If last_delim_column and self.column are the same, treat as Down state since it is the same column as
    # delimiter.
    self.indent_state = YamlIndentState.UP if delim_column < self.column else YamlIndentState.DOWN
549
+
550
def update_indent(self, ctx: YamlScanningContext, c: str) -> None:
    """Update the indentation counters and indent state for character `c`."""
    if not self.is_first_char_at_line:
        # Past the first content character of the line: indentation does not change.
        self.indent_state = YamlIndentState.KEEP
        return

    if self.is_new_line_char(c):
        return

    if c == ' ':
        self.indent_num += 1
        return

    if c == '\t':
        # Found tab indent. In this case, scan_tab returns error.
        return

    # First non-blank character of the line: settle the level and state, then leave "first char" mode.
    self.update_indent_level()
    self.update_indent_state(ctx)
    self.is_first_char_at_line = False
565
+
566
def is_changed_to_indent_state_down(self) -> bool:
    """Return True when the current indent state is DOWN."""
    return self.indent_state is YamlIndentState.DOWN
568
+
569
def is_changed_to_indent_state_up(self) -> bool:
    """Return True when the current indent state is UP."""
    return self.indent_state is YamlIndentState.UP
571
+
572
def add_buffered_token_if_exists(self, ctx: YamlScanningContext) -> None:
    """Emit the token for the currently buffered text, if any (add_token ignores None)."""
    pending = self.buffered_token(ctx)
    ctx.add_token(pending)
574
+
575
def break_multi_line(self, ctx: YamlScanningContext) -> None:
    """Leave multi-line mode on `ctx`."""
    ctx.break_multi_line()
    return None
577
+
578
def scan_single_quote(self, ctx: YamlScanningContext) -> YamlErrorOr[YamlToken]:
    """
    Scan a single-quoted scalar whose opening quote is at the context cursor.

    Line breaks inside the scalar are folded: the first break after content becomes a space, further consecutive
    breaks become newlines, and trailing spaces before a break plus leading spaces on a continuation line are
    dropped. A doubled quote ('') yields one quote character.

    Returns the single-quote token, or an `InvalidTokenYamlError` when a document separator starts a continuation
    line, a tab is used as indentation, or the closing quote is missing.
    """
    ctx.add_origin_buf("'")
    srcpos = self.pos()
    src = ctx.src
    size = len(src)
    value = ''
    is_first_line_char = False
    is_new_line = False

    # The loop pre-increments, so scanning starts at the character right after the opening quote.
    idx = ctx.idx
    while True:
        idx += 1
        if idx >= size:
            break

        # progress_line() already moved the cursor past a line break, so skip one column step after it.
        if not is_new_line:
            self.progress_column(ctx, 1)
        else:
            is_new_line = False

        c = src[idx]
        ctx.add_origin_buf(c)

        if self.is_new_line_char(c):
            # Spaces directly before the line break are not part of the value.
            value = value.rstrip(' ')
            value += '\n' if is_first_line_char else ' '

            is_first_line_char = True
            is_new_line = True
            self.progress_line(ctx)
            if idx + 1 < size:
                # Only the first four characters decide whether a document separator follows; pass just those
                # instead of copying the whole remaining source on every line break.
                err = self.validate_document_separator_marker(ctx, src[idx + 1:idx + 5])
                if err is not None:
                    return err

            continue

        if is_first_line_char and c == ' ':
            continue

        if is_first_line_char and c == '\t':
            if self.last_delim_column >= self.column:
                return err_invalid_token(
                    YamlTokenMakers.new_invalid(
                        yaml_error('tab character cannot be used for indentation in single-quoted text'),
                        ctx.obuf,
                        self.pos(),
                    ),
                )

            continue

        if c != "'":
            value += c
            is_first_line_char = False
            continue

        if idx + 1 < size and src[idx + 1] == "'":
            # '' handle as ' character
            value += c
            ctx.add_origin_buf(c)
            idx += 1
            self.progress_column(ctx, 1)
            continue

        # Closing quote.
        self.progress_column(ctx, 1)
        return YamlTokenMakers.new_single_quote(value, ctx.obuf, srcpos)

    # Ran off the end of the source without finding the closing quote.
    self.progress_column(ctx, 1)
    return err_invalid_token(
        YamlTokenMakers.new_invalid(
            yaml_error('could not find end character of single-quoted text'),
            ctx.obuf,
            srcpos,
        ),
    )
664
+
665
def scan_double_quote(self, ctx: YamlScanningContext) -> YamlErrorOr[YamlToken]:
    """
    Scan a double-quoted scalar whose opening quote is at the context cursor.

    Line breaks are folded as in single-quoted text, and backslash escapes are decoded: the one-character
    escapes in the table below, `\\xXX`, `\\uXXXX` (including UTF-16 surrogate pairs), `\\UXXXXXXXX`, and an
    escaped line break.

    Returns the double-quote token, or an `InvalidTokenYamlError` for a document separator on a continuation
    line, a tab used as indentation, a malformed or unknown escape, or a missing closing quote.
    """
    # Escapes that consume exactly one character after the backslash and decode to a fixed character.
    # https://yaml.org/spec/1.2.2/#57-escaped-characters
    simple_escapes = {
        '0': '\x00',
        'a': '\x07',
        'b': '\x08',
        't': '\x09',
        'n': '\x0A',
        'v': '\x0B',
        'f': '\x0C',
        'r': '\x0D',
        'e': '\x1B',
        ' ': '\x20',
        '"': '\x22',
        '/': '\x2F',
        '\\': '\x5C',
        'N': '\x85',
        '_': '\xA0',
        'L': '\u2028',
        'P': '\u2029',
        '\t': '\t',
    }

    def invalid(message: str, pos: YamlPosition) -> InvalidTokenYamlError:
        # Build the error for an invalid token that covers the original text consumed so far.
        return err_invalid_token(YamlTokenMakers.new_invalid(yaml_error(message), ctx.obuf, pos))

    ctx.add_origin_buf('"')
    srcpos = self.pos()
    src = ctx.src
    size = len(src)
    value = ''
    is_first_line_char = False
    is_new_line = False

    # The loop pre-increments, so scanning starts at the character right after the opening quote.
    idx = ctx.idx
    while True:
        idx += 1
        if idx >= size:
            break

        # progress_line() already moved the cursor past a line break, so skip one column step after it.
        if not is_new_line:
            self.progress_column(ctx, 1)
        else:
            is_new_line = False

        c = src[idx]
        ctx.add_origin_buf(c)

        if self.is_new_line_char(c):
            # Spaces directly before the line break are not part of the value.
            value = value.rstrip(' ')
            value += '\n' if is_first_line_char else ' '

            is_first_line_char = True
            is_new_line = True
            self.progress_line(ctx)
            if idx + 1 < size:
                # Only the first four characters decide whether a document separator follows; pass just those
                # instead of copying the whole remaining source on every line break.
                err = self.validate_document_separator_marker(ctx, src[idx + 1:idx + 5])
                if err is not None:
                    return err

            continue

        if is_first_line_char and c == ' ':
            continue

        if is_first_line_char and c == '\t':
            if self.last_delim_column >= self.column:
                return invalid('tab character cannot be used for indentation in double-quoted text', self.pos())

            continue

        if c == '\\':
            is_first_line_char = False
            if idx + 1 >= size:
                # Trailing backslash at the very end of the source: keep it literally.
                value += c
                continue

            next_char = src[idx + 1]
            progress = 0

            if next_char in simple_escapes:
                progress = 1
                ctx.add_origin_buf(next_char)
                value += simple_escapes[next_char]

            elif next_char == 'x':
                if idx + 3 >= size:
                    # Not enough characters left for two hex digits: keep the 'x' literally.
                    progress = 1
                    ctx.add_origin_buf(next_char)
                    value += next_char
                else:
                    progress = 3
                    code_num = hex_runes_to_int(src[idx + 2: idx + progress + 1])
                    value += chr(code_num)

            elif next_char == 'u':
                # \u0000 style must have 5 characters at least.
                if idx + 5 >= size:
                    return invalid('not enough length for escaped UTF-16 character', self.pos())

                progress = 5
                code_num = hex_runes_to_int(src[idx + 2: idx + 6])

                # handle surrogate pairs.
                if 0xD800 <= code_num <= 0xDBFF:
                    high = code_num

                    # \u0000\u0000 style must have 11 characters at least.
                    if idx + 11 >= size:
                        return invalid('not enough length for escaped UTF-16 surrogate pair', self.pos())

                    if src[idx + 6] != '\\' or src[idx + 7] != 'u':
                        return invalid(
                            'found unexpected character after high surrogate for UTF-16 surrogate pair',
                            self.pos(),
                        )

                    low = hex_runes_to_int(src[idx + 8: idx + 12])
                    if low < 0xDC00 or low > 0xDFFF:
                        return invalid('found unexpected low surrogate after high surrogate', self.pos())

                    code_num = ((high - 0xD800) * 0x400) + (low - 0xDC00) + 0x10000
                    progress += 6

                value += chr(code_num)

            elif next_char == 'U':
                # \U00000000 style must have 9 characters at least.
                if idx + 9 >= size:
                    return invalid('not enough length for escaped UTF-32 character', self.pos())

                progress = 9
                code_num = hex_runes_to_int(src[idx + 2: idx + 10])
                # Eight hex digits can exceed the Unicode range, where chr() raises ValueError; report that as
                # an invalid token instead of letting the exception escape the scanner.
                if not (0 <= code_num <= 0x10FFFF):
                    return invalid('found escaped UTF-32 character outside of the Unicode range', self.pos())

                value += chr(code_num)

            elif next_char == '\n':
                # Escaped line break: the break is consumed and contributes nothing to the value.
                is_first_line_char = True
                is_new_line = True
                ctx.add_origin_buf(next_char)
                self.progress_column(ctx, 1)
                self.progress_line(ctx)
                idx += 1
                continue

            elif next_char == '\r':
                is_first_line_char = True
                is_new_line = True
                ctx.add_origin_buf(next_char)
                self.progress_line(ctx)
                progress = 1
                # Skip \n after \r in CRLF sequences
                if idx + 2 < size and src[idx + 2] == '\n':
                    ctx.add_origin_buf('\n')
                    progress = 2

            else:
                self.progress_column(ctx, 1)
                return invalid(f'found unknown escape character {next_char!r}', self.pos())

            idx += progress
            self.progress_column(ctx, progress)
            continue

        if c == '\t':
            # A tab is kept only when some non-blank character follows it on the same line; otherwise it and
            # the blanks after it are skipped.
            found_not_space_char = False
            progress = 0

            for i in range(idx + 1, size):
                if src[i] == ' ' or src[i] == '\t':
                    progress += 1
                    continue

                if not self.is_new_line_char(src[i]):
                    found_not_space_char = True

                # Either outcome is decided at the first non-blank character; `progress` is only used when no
                # content was found, so there is no need to scan further.
                break

            if found_not_space_char:
                value += c
                if src[idx + 1] != '"':
                    self.progress_column(ctx, 1)

            else:
                idx += progress
                self.progress_column(ctx, progress)

            continue

        if c != '"':
            value += c
            is_first_line_char = False
            continue

        # Closing quote.
        self.progress_column(ctx, 1)
        return YamlTokenMakers.new_double_quote(value, ctx.obuf, srcpos)

    # Ran off the end of the source without finding the closing quote.
    self.progress_column(ctx, 1)
    return invalid('could not find end character of double-quoted text', srcpos)
963
+
964
def validate_document_separator_marker(self, ctx: YamlScanningContext, src: str) -> ta.Optional[YamlError]:
    """Return an invalid-token error when `src` starts with a document separator marker, else None."""
    if not self.found_document_separator_marker(src):
        return None

    invalid_tk = YamlTokenMakers.new_invalid(yaml_error('found unexpected document separator'), ctx.obuf, self.pos())
    return err_invalid_token(invalid_tk)
971
+
972
def found_document_separator_marker(self, src: str) -> bool:
    """Return True when `src` begins with '---' or '...' followed by nothing or by whitespace."""
    length = len(src)
    if length < 3:
        return False

    if length == 3:
        marker = src
    else:
        # Look at the first four characters and drop trailing blanks / line breaks from them.
        marker = trim_right_func(src[:4], lambda r: r in (' ', '\t', '\n', '\r'))

    return marker in ('---', '...')
983
+
984
def scan_quote(self, ctx: YamlScanningContext, ch: str) -> YamlErrorOr[bool]:
    """
    Scan a quoted scalar starting with quote character `ch` and add its token to `ctx`.

    Returns False without consuming anything when text is already buffered, the scan error when the quoted
    text is invalid, and True once the token has been added.
    """
    if ctx.exists_buffer():
        return False

    # A single quote selects the single-quote scanner; anything else is scanned as double-quoted.
    scanned = self.scan_single_quote(ctx) if ch == "'" else self.scan_double_quote(ctx)
    if isinstance(scanned, YamlError):
        return scanned

    ctx.add_token(scanned)
    ctx.clear()
    return True
1004
+
1005
def scan_white_space(self, ctx: YamlScanningContext) -> bool:
    """
    Handle a space character; return True when it was handled here and False when the caller must handle it.

    Spaces are handled only outside multi-line mode and only while at the start of a line or while an anchor,
    alias or directive is being scanned.
    """
    if ctx.is_multi_line():
        return False

    at_line_start = self.is_first_char_at_line
    in_directive = self.is_directive
    if not (self.is_anchor or in_directive or self.is_alias or at_line_start):
        return False

    if not at_line_start:
        # The space terminates whatever anchor / alias / directive text has been buffered.
        self.add_buffered_token_if_exists(ctx)
        if not in_directive:
            self.is_anchor = False
            self.is_alias = False
            return True

    # Leading indentation and directive separators are consumed here.
    self.progress_column(ctx, 1)
    ctx.add_origin_buf(' ')
    return True
1027
+
1028
def is_merge_key(self, ctx: YamlScanningContext) -> bool:
    """Return True when the cursor is at '<<' followed by optional spaces, a ':' and then a space or line break."""
    if ctx.repeat_num('<') != 2:
        return False

    src = ctx.src
    size = len(src)
    idx = ctx.idx + 2
    while idx < size:
        c = src[idx]
        if c != ' ':
            if c != ':':
                return False

            if idx + 1 < size:
                nc = src[idx + 1]
                if nc == ' ' or self.is_new_line_char(nc):
                    return True

        idx += 1

    return False
1048
+
1049
def scan_tag(self, ctx: YamlScanningContext) -> YamlErrorOr[bool]:
    """
    Scan a tag starting at the '!' under the cursor and add its token to `ctx`.

    Returns False without consuming anything when text is already buffered or a directive is being scanned, an
    `InvalidTokenYamlError` when the tag contains '{' or '}', and True otherwise.
    """
    if ctx.exists_buffer() or self.is_directive:
        return False

    ctx.add_origin_buf('!')
    self.progress(ctx, 1)  # skip '!' character

    # Walk the source by index from the character after '!'. The cursor is not advanced inside the loop until a
    # branch returns, so `start` stays valid, and no copy of the remaining source is made.
    src = ctx.src
    start = ctx.idx
    progress = 0
    for pos in range(start, len(src)):
        c = src[pos]
        progress = pos - start + 1

        if c == ' ':
            ctx.add_origin_buf(c)
            value = ctx.source(start - 1, pos)
            ctx.add_token(YamlTokenMakers.new_tag(value, ctx.obuf, self.pos()))
            self.progress_column(ctx, len(value))
            ctx.clear()
            return True

        elif c == ',':
            if self.started_flow_sequence_num > 0 or self.started_flow_map_num > 0:
                value = ctx.source(start - 1, pos)
                ctx.add_token(YamlTokenMakers.new_tag(value, ctx.obuf, self.pos()))
                # progress column before collect-entry for scanning it at scan_flow_entry function.
                self.progress_column(ctx, len(value) - 1)
                ctx.clear()
                return True
            else:
                # Outside a flow collection a comma is an ordinary tag character.
                ctx.add_origin_buf(c)

        elif c in ('\n', '\r'):
            ctx.add_origin_buf(c)
            value = ctx.source(start - 1, pos)
            ctx.add_token(YamlTokenMakers.new_tag(value, ctx.obuf, self.pos()))
            # progress column before new-line-char for scanning new-line-char at scan_new_line function.
            self.progress_column(ctx, len(value) - 1)
            ctx.clear()
            return True

        elif c in ('{', '}'):
            ctx.add_origin_buf(c)
            self.progress_column(ctx, progress)
            invalid_tk = YamlTokenMakers.new_invalid(
                yaml_error(f'found invalid tag character {c!r}'),
                ctx.obuf,
                self.pos(),
            )
            return err_invalid_token(invalid_tk)

        else:
            ctx.add_origin_buf(c)

    # The source ended while still inside the tag: consume what was read; no token is added on this path.
    self.progress_column(ctx, progress)
    ctx.clear()
    return True
1104
+
1105
def scan_comment(self, ctx: YamlScanningContext) -> bool:
    """
    Scan a comment that begins at the current '#' character.

    Returns False (consuming nothing) when buffered content exists and the previous character is not a space, tab
    or new-line character. Otherwise a comment token is added - ending at the first new-line character, or at the
    end of the input - and True is returned.
    """
    if ctx.exists_buffer():
        before = ctx.previous_char()
        if not (before == ' ' or before == '\t' or self.is_new_line_char(before)):
            return False

    self.add_buffered_token_if_exists(ctx)
    ctx.add_origin_buf('#')
    self.progress(ctx, 1)  # step over the '#' itself

    for offset, ch in enumerate(ctx.src[ctx.idx:]):
        ctx.add_origin_buf(ch)
        # Keep going until a new-line character is reached (unless previous_char() reports a backslash).
        if not self.is_new_line_char(ch) or ctx.previous_char() == '\\':
            continue

        text = ctx.source(ctx.idx, ctx.idx + offset)
        width = len(text)
        ctx.add_token(YamlTokenMakers.new_comment(text, ctx.obuf, self.pos()))
        self.progress_column(ctx, width)
        self.progress_line(ctx)
        ctx.clear()
        return True

    # No line break was found: the comment runs to the end of the input.
    text = ctx.src[ctx.idx:]
    ctx.add_token(YamlTokenMakers.new_comment(text, ctx.obuf, self.pos()))
    width = len(text)
    self.progress_column(ctx, width)
    self.progress_line(ctx)
    ctx.clear()
    return True
1138
+
1139
def scan_multi_line(self, ctx: YamlScanningContext, c: str) -> ta.Optional[YamlError]:
    """
    Process one character `c` while the context is in multi-line (literal / folded) state.

    The character is always appended to the origin buffer first. Exactly one of the branches below then handles
    it, depending on end-of-input, new-line characters, leading spaces, tabs, or ordinary content.

    Returns:
        None on success, or an InvalidTokenYamlError when an indentation check of the multi-line state fails or a
        tab is found where an indentation space is expected.
    """
    # The multi-line state must exist here; check.not_none enforces that.
    state = check.not_none(ctx.get_multi_line_state())
    ctx.add_origin_buf(c)

    if ctx.is_eos():
        # Last character of the input: record it, validate the indent and emit the collected text as a string.
        if self.is_first_char_at_line and c == ' ':
            state.add_indent(ctx, self.column)
        else:
            ctx.add_buf(c)

        state.update_indent_column(self.column)
        if (err := state.validate_indent_column()) is not None:
            invalid_tk = YamlTokenMakers.new_invalid(yaml_error(str(err)), ctx.obuf, self.pos())
            self.progress_column(ctx, 1)
            return err_invalid_token(invalid_tk)

        value = ctx.buffered_src()
        ctx.add_token(YamlTokenMakers.new_string(value, ctx.obuf, self.pos()))
        ctx.clear()
        self.progress_column(ctx, 1)

    elif self.is_new_line_char(c):
        ctx.add_buf(c)
        state.update_space_only_indent_column(self.column - 1)
        state.update_new_line_state()
        self.progress_line(ctx)
        if ctx.next():
            # A document separator right after the line break ends the multi-line scalar.
            if self.found_document_separator_marker(ctx.src[ctx.idx:]):
                value = ctx.buffered_src()
                ctx.add_token(YamlTokenMakers.new_string(value, ctx.obuf, self.pos()))
                ctx.clear()
                self.break_multi_line(ctx)

    elif self.is_first_char_at_line and c == ' ':
        # Leading space on a line: handed to the state as indentation.
        state.add_indent(ctx, self.column)
        self.progress_column(ctx, 1)

    elif self.is_first_char_at_line and c == '\t' and state.is_indent_column(self.column):
        # The error is built before the column is advanced, so self.pos() is taken at the tab itself.
        err = err_invalid_token(
            YamlTokenMakers.new_invalid(
                yaml_error('found a tab character where an indentation space is expected'),
                ctx.obuf,
                self.pos(),
            ),
        )
        self.progress_column(ctx, 1)
        return err

    elif c == '\t' and not state.is_indent_column(self.column):
        # A tab outside the indent column is kept as content.
        ctx.add_buf_with_tab(c)
        self.progress_column(ctx, 1)

    else:
        # Ordinary content character: run both indent validations before buffering it.
        if (err := state.validate_indent_after_space_only(self.column)) is not None:
            invalid_tk = YamlTokenMakers.new_invalid(yaml_error(str(err)), ctx.obuf, self.pos())
            self.progress_column(ctx, 1)
            return err_invalid_token(invalid_tk)

        state.update_indent_column(self.column)
        if (err := state.validate_indent_column()) is not None:
            invalid_tk = YamlTokenMakers.new_invalid(yaml_error(str(err)), ctx.obuf, self.pos())
            self.progress_column(ctx, 1)
            return err_invalid_token(invalid_tk)

        # Adopt the state's delimiter column only when it reports a positive one.
        if (col := state.last_delim_column()) > 0:
            self.last_delim_column = col

        state.update_new_line_in_folded(ctx, self.column)
        ctx.add_buf_with_tab(c)
        self.progress_column(ctx, 1)

    return None
1211
+
1212
def scan_new_line(self, ctx: YamlScanningContext, c: str) -> None:
    """
    Handle a new-line character `c` ('\\r' or '\\n') outside of multi-line scalars.

    Remembers where the buffered text started (if not remembered yet), trims trailing spaces from the buffer,
    folds a CR LF pair into a single LF, flushes the buffer as a token when required, and finally records the line
    break and moves on to the next line.
    """
    if ctx.buf and self.saved_pos is None:
        # Remember where the buffered text began: the current position shifted back by its length.
        shift = len(ctx.buffered_src())
        start = self.pos()
        self.saved_pos = start
        start.column -= shift
        start.offset -= shift

    # In a case like the following the buffer ends with spaces that are not part of the value, so they are
    # stripped here:
    # ---
    # a:[space][space]
    # b: c
    ctx.remove_right_space_from_buf()

    # A CR that is followed by LF can safely be dropped and treated as LF, per the YAML 1.2 spec:
    # > Line breaks inside scalar content must be normalized by the YAML processor. Each such line break must be
    #   parsed into a single line feed character.
    # > Outside scalar content, YAML allows any line break to be used to terminate lines.
    # > -- https://yaml.org/spec/1.2/spec.html
    if c == '\r' and ctx.next_char() == '\n':
        ctx.add_origin_buf('\r')
        self.progress(ctx, 1)
        self.offset += 1
        c = '\n'

    if ctx.is_eos() or self.is_anchor or self.is_alias or self.is_directive:
        self.add_buffered_token_if_exists(ctx)

    if ctx.exists_buffer() and self.is_first_char_at_line:
        # Line break at the start of a line: a trailing space in the buffer is replaced by '\n', otherwise '\n'
        # is appended.
        kept = ctx.buf[:-1] if ctx.buf[-1] == ' ' else ctx.buf
        ctx.buf = kept + '\n'
    else:
        ctx.add_buf(' ')

    ctx.add_origin_buf(c)
    self.progress_line(ctx)
1253
+
1254
def is_flow_mode(self) -> bool:
    """Return True while at least one flow sequence or flow mapping has been started and not yet ended."""
    return self.started_flow_sequence_num > 0 or self.started_flow_map_num > 0
1262
+
1263
def scan_flow_map_start(self, ctx: YamlScanningContext) -> bool:
    """
    Handle a '{' character as the start of a flow mapping.

    Returns False when buffered content exists and the scanner is not already in flow mode; otherwise flushes any
    buffered token, adds a mapping-start token, increments `started_flow_map_num` and returns True.
    """
    if ctx.exists_buffer():
        if not self.is_flow_mode():
            return False

    self.add_buffered_token_if_exists(ctx)
    ctx.add_origin_buf('{')
    start_tk = YamlTokenMakers.new_mapping_start(ctx.obuf, self.pos())
    ctx.add_token(start_tk)
    self.started_flow_map_num += 1
    self.progress_column(ctx, 1)
    ctx.clear()
    return True
1274
+
1275
def scan_flow_map_end(self, ctx: YamlScanningContext) -> bool:
    """
    Handle a '}' character as the end of a flow mapping.

    Returns False when no flow mapping is currently open; otherwise flushes any buffered token, adds a mapping-end
    token, decrements `started_flow_map_num` and returns True.
    """
    if not self.started_flow_map_num > 0:
        return False

    self.add_buffered_token_if_exists(ctx)
    ctx.add_origin_buf('}')
    end_tk = YamlTokenMakers.new_mapping_end(ctx.obuf, self.pos())
    ctx.add_token(end_tk)
    self.started_flow_map_num -= 1
    self.progress_column(ctx, 1)
    ctx.clear()
    return True
1286
+
1287
def scan_flow_array_start(self, ctx: YamlScanningContext) -> bool:
    """
    Handle a '[' character as the start of a flow sequence.

    Returns False when buffered content exists and the scanner is not already in flow mode; otherwise flushes any
    buffered token, adds a sequence-start token, increments `started_flow_sequence_num` and returns True.
    """
    if ctx.exists_buffer():
        if not self.is_flow_mode():
            return False

    self.add_buffered_token_if_exists(ctx)
    ctx.add_origin_buf('[')
    start_tk = YamlTokenMakers.new_sequence_start(ctx.obuf, self.pos())
    ctx.add_token(start_tk)
    self.started_flow_sequence_num += 1
    self.progress_column(ctx, 1)
    ctx.clear()
    return True
1298
+
1299
def scan_flow_array_end(self, ctx: YamlScanningContext) -> bool:
    """
    Handle a ']' character as the end of a flow sequence.

    Returns False only when buffered content exists while no flow sequence is open; otherwise flushes any buffered
    token, adds a sequence-end token, decrements `started_flow_sequence_num` and returns True.
    """
    # NOTE(review): with an empty buffer and no open flow sequence this still emits an end token and drives the
    # counter below zero - confirm that this is intended.
    if ctx.exists_buffer():
        if self.started_flow_sequence_num <= 0:
            return False

    self.add_buffered_token_if_exists(ctx)
    ctx.add_origin_buf(']')
    end_tk = YamlTokenMakers.new_sequence_end(ctx.obuf, self.pos())
    ctx.add_token(end_tk)
    self.started_flow_sequence_num -= 1
    self.progress_column(ctx, 1)
    ctx.clear()
    return True
1310
+
1311
def scan_flow_entry(self, ctx: YamlScanningContext, c: str) -> bool:
    """
    Handle the entry separator `c` (the caller passes ',') inside a flow collection.

    Returns False when neither a flow sequence nor a flow mapping is open; otherwise flushes any buffered token,
    adds a collect-entry token and returns True.
    """
    inside_flow = self.started_flow_sequence_num > 0 or self.started_flow_map_num > 0
    if not inside_flow:
        return False

    self.add_buffered_token_if_exists(ctx)
    ctx.add_origin_buf(c)
    entry_tk = YamlTokenMakers.new_collect_entry(ctx.obuf, self.pos())
    ctx.add_token(entry_tk)
    self.progress_column(ctx, 1)
    ctx.clear()
    return True
1321
+
1322
def scan_map_delim(self, ctx: YamlScanningContext) -> YamlErrorOr[bool]:
    """
    Handle a ':' character as a possible mapping-value delimiter.

    Returns:
        False when the ':' is not treated as a delimiter, True after a mapping-value token has been added, or an
        InvalidTokenYamlError when a tab is used directly as a map key.
    """
    nc = ctx.next_char()
    if self.is_directive or self.is_anchor or self.is_alias:
        return False

    in_flow_map = self.started_flow_map_num > 0

    if (
        not in_flow_map and
        nc not in (' ', '\t') and
        not self.is_new_line_char(nc) and
        not ctx.is_next_eos()
    ):
        # Outside a flow mapping the ':' must be followed by a blank, a line break or the end of the input.
        return False

    if in_flow_map:
        if nc == '/':
            # like http://
            return False

        prev_tk = ctx.last_token()
        if prev_tk is not None and prev_tk.type == YamlTokenType.MAPPING_VALUE:
            return False

    if ctx.obuf.lstrip(' ').startswith('\t') and not ctx.buf.startswith('\t'):
        bad_tk = YamlTokenMakers.new_invalid(
            yaml_error('tab character cannot use as a map key directly'),
            ctx.obuf,
            self.pos(),
        )
        self.progress_column(ctx, 1)
        return err_invalid_token(bad_tk)

    # mapping value
    key_tk = self.buffered_token(ctx)
    if key_tk is not None:
        self.last_delim_column = key_tk.position.column
        ctx.add_token(key_tk)
    else:
        # A quoted map key has already been cut into a token, so no buffer exists for it; look at the last token
        # instead.
        last_tk = ctx.last_token()
        if last_tk is not None and last_tk.indicator == YamlIndicator.QUOTED_SCALAR:
            self.last_delim_column = last_tk.position.column

    ctx.add_token(YamlTokenMakers.new_mapping_value(self.pos()))
    self.progress_column(ctx, 1)
    ctx.clear()
    return True
1370
+
1371
def scan_document_start(self, ctx: YamlScanningContext) -> bool:
    """
    Handle '---' as a document header.

    Only applies with zero indentation, at column 1, when `ctx.repeat_num('-')` is exactly 3 and the character
    after the marker (if any) is a space, tab, LF or CR. On success the buffered token is flushed, a
    document-header token is added, the scanner state is cleared and True is returned.
    """
    if self.indent_num != 0 or self.column != 1 or ctx.repeat_num('-') != 3:
        return False

    after_marker = ctx.idx + 3
    if ctx.size > after_marker:
        follower = ctx.src[after_marker]
        if follower not in (' ', '\t', '\n', '\r'):
            return False

    self.add_buffered_token_if_exists(ctx)
    header_tk = YamlTokenMakers.new_document_header(ctx.obuf + '---', self.pos())
    ctx.add_token(header_tk)
    self.progress_column(ctx, 3)
    ctx.clear()
    self.clear_state()
    return True
1392
+
1393
def scan_document_end(self, ctx: YamlScanningContext) -> bool:
    """
    Handle '...' as a document end marker.

    Only applies with zero indentation, at column 1, when `ctx.repeat_num('.')` is exactly 3. On success the
    buffered token is flushed, a document-end token is added and True is returned.
    """
    if self.indent_num != 0 or self.column != 1 or ctx.repeat_num('.') != 3:
        return False

    self.add_buffered_token_if_exists(ctx)
    end_tk = YamlTokenMakers.new_document_end(ctx.obuf + '...', self.pos())
    ctx.add_token(end_tk)
    self.progress_column(ctx, 3)
    ctx.clear()
    return True
1408
+
1409
def scan_merge_key(self, ctx: YamlScanningContext) -> bool:
    """
    Handle '<<' as a merge key.

    Returns False unless `is_merge_key` accepts the current position; otherwise records the current column as the
    last delimiter column, adds a merge-key token, advances two columns and returns True.
    """
    if self.is_merge_key(ctx):
        self.last_delim_column = self.column
        merge_tk = YamlTokenMakers.new_merge_key(ctx.obuf + '<<', self.pos())
        ctx.add_token(merge_tk)
        self.progress_column(ctx, 2)
        ctx.clear()
        return True

    return False
1418
+
1419
def scan_raw_folded_char(self, ctx: YamlScanningContext) -> bool:
    """
    Treat a '-' as plain content of a raw-folded scalar.

    Applies only when buffered content exists and the indent state has just changed to "up". In that case the
    context is switched to raw-folded mode at the current column, the '-' is appended to both buffers and True is
    returned; otherwise False.
    """
    if not (ctx.exists_buffer() and self.is_changed_to_indent_state_up()):
        return False

    ctx.set_raw_folded(self.column)
    dash = '-'
    ctx.add_buf(dash)
    ctx.add_origin_buf(dash)
    self.progress_column(ctx, 1)
    return True
1431
+
1432
def scan_sequence(self, ctx: YamlScanningContext) -> YamlErrorOr[bool]:
    """
    Handle a '-' character as a block sequence entry indicator.

    Returns:
        False when buffered content exists or the '-' is followed by an ordinary character; True after a
        sequence-entry token has been added; an InvalidTokenYamlError when the line was indented with a tab before
        the '-'.
    """
    if ctx.exists_buffer():
        return False

    nc = ctx.next_char()
    # The '-' is an entry indicator when followed by a blank, a line break, or nothing at all (end of input).
    # The previous check `nc != 0` compared a str with the int 0 and was therefore always true, so the
    # end-of-input case was never recognised.
    # NOTE(review): assumes next_char() yields a falsy value (or NUL) when there is no next character - confirm.
    has_next = bool(nc) and nc != '\0'
    if has_next and nc != ' ' and nc != '\t' and not self.is_new_line_char(nc):
        return False

    if ctx.obuf.lstrip(' ').startswith('\t'):
        invalid_tk = YamlTokenMakers.new_invalid(
            yaml_error('tab character cannot use as a sequence delimiter'),
            ctx.obuf,
            self.pos(),
        )
        self.progress_column(ctx, 1)
        return err_invalid_token(invalid_tk)

    self.add_buffered_token_if_exists(ctx)
    ctx.add_origin_buf('-')
    tk = YamlTokenMakers.new_sequence_entry(ctx.obuf, self.pos())
    # The entry's column becomes the reference delimiter column for what follows.
    self.last_delim_column = tk.position.column
    ctx.add_token(tk)
    self.progress_column(ctx, 1)
    ctx.clear()
    return True
1457
+
1458
def scan_multi_line_header(self, ctx: YamlScanningContext) -> YamlErrorOr[bool]:
    """
    Handle a '|' or '>' character as the header of a multi-line scalar.

    Returns False when buffered content exists, the error produced by `scan_multi_line_header_option` if there is
    one, and otherwise True after moving on to the next line.
    """
    if ctx.exists_buffer():
        return False

    failure = self.scan_multi_line_header_option(ctx)
    if failure is not None:
        return failure

    self.progress_line(ctx)
    return True
1467
+
1468
def validate_multi_line_header_option(self, opt: str) -> ta.Optional[YamlError]:
    """
    Validate the option text that follows a '|' or '>' multi-line header.

    Leading and trailing '-' and '+' characters are stripped; what remains must be empty or parse as a base-10
    integer that is not the literal '0' and is not greater than 9.

    Returns:
        None when the option is acceptable, otherwise a YamlError naming the original option text.
    """
    if not opt:
        return None

    digits = opt.lstrip('-').lstrip('+').rstrip('-').rstrip('+')
    if not digits:
        return None

    if digits != '0':
        try:
            amount = int(digits, 10)
        except ValueError:
            pass
        else:
            if amount <= 9:
                return None

    return yaml_error(f'invalid header option: {opt!r}')
1492
+
1493
def scan_multi_line_header_option(self, ctx: YamlScanningContext) -> ta.Optional[YamlError]:
    """
    Scan the rest of a '|' / '>' header line and put the context into literal or folded mode.

    The text after the header character is split into the option part and an optional '#' comment. The option is
    validated, a literal or folded token is added (plus a comment token when a comment is present), and the
    context is switched with `set_literal` / `set_folded`.

    Returns:
        None on success, or an InvalidTokenYamlError when the option fails validation.
    """
    header = ctx.current_char()
    ctx.add_origin_buf(header)
    self.progress(ctx, 1)  # skip '|' or '>' character

    progress = 0
    for idx, c in enumerate(ctx.src[ctx.idx:]):
        # `progress` ends up as the index of the first new-line character, or of the last character when the
        # input ends without one.
        progress = idx
        ctx.add_origin_buf(c)
        if self.is_new_line_char(c):
            break

    # NOTE(review): when the input ends without a line break, the range below appears to leave out the final
    # character (presumably ctx.source treats the end index as exclusive) - confirm.
    value = ctx.source(ctx.idx, ctx.idx + progress).rstrip(' ')
    comment_value_index = value.find('#')
    opt = value
    # Only a '#' found after position 0 splits the value into option and comment.
    if comment_value_index > 0:
        opt = value[:comment_value_index]

    opt = trim_right_func(opt, lambda r: r == ' ' or r == '\t')

    if len(opt) != 0:
        if (err := self.validate_multi_line_header_option(opt)) is not None:
            invalid_tk = YamlTokenMakers.new_invalid(yaml_error(str(err)), ctx.obuf, self.pos())
            self.progress_column(ctx, progress)
            return err_invalid_token(invalid_tk)

    if self.column == 1:
        self.last_delim_column = 1

    # The header token's origin text excludes the comment, if the origin buffer contains one.
    comment_index = ctx.obuf.find('#')
    header_buf = ctx.obuf
    if comment_index > 0:
        header_buf = header_buf[:comment_index]

    if header == '|':
        ctx.add_token(YamlTokenMakers.new_literal('|' + opt, header_buf, self.pos()))
        ctx.set_literal(self.last_delim_column, opt)
    elif header == '>':
        ctx.add_token(YamlTokenMakers.new_folded('>' + opt, header_buf, self.pos()))
        ctx.set_folded(self.last_delim_column, opt)

    if comment_index > 0:
        comment = value[comment_value_index + 1:]
        # Shift offset and column past the header part so that self.pos() for the comment token is taken after it.
        self.offset += len(header_buf)
        self.column += len(header_buf)
        ctx.add_token(YamlTokenMakers.new_comment(comment, ctx.obuf[len(header_buf):], self.pos()))

    self.indent_state = YamlIndentState.KEEP
    ctx.reset_buffer()
    self.progress_column(ctx, progress)
    return None
1544
+
1545
def scan_map_key(self, ctx: YamlScanningContext) -> bool:
    """
    Handle a '?' character as an explicit mapping key indicator.

    Returns False when buffered content exists or the next character is neither a space nor a tab; otherwise adds
    a mapping-key token, records its column as the last delimiter column and returns True.
    """
    if ctx.exists_buffer():
        return False

    if ctx.next_char() not in (' ', '\t'):
        return False

    key_tk = YamlTokenMakers.new_mapping_key(self.pos())
    self.last_delim_column = key_tk.position.column
    ctx.add_token(key_tk)
    self.progress_column(ctx, 1)
    ctx.clear()
    return True
1559
+
1560
def scan_directive(self, ctx: YamlScanningContext) -> bool:
    """
    Handle a '%' character as the start of a directive.

    Returns False when buffered content exists or the indentation is not zero; otherwise adds a directive token,
    sets `is_directive` and returns True.
    """
    if ctx.exists_buffer() or self.indent_num != 0:
        return False

    self.add_buffered_token_if_exists(ctx)
    ctx.add_origin_buf('%')
    directive_tk = YamlTokenMakers.new_directive(ctx.obuf, self.pos())
    ctx.add_token(directive_tk)
    self.progress_column(ctx, 1)
    ctx.clear()
    self.is_directive = True
    return True
1573
+
1574
def scan_anchor(self, ctx: YamlScanningContext) -> bool:
    """
    Handle an '&' character as the start of an anchor.

    Returns False when buffered content exists; otherwise adds an anchor token, sets `is_anchor` and returns True.
    """
    if ctx.exists_buffer():
        return False

    self.add_buffered_token_if_exists(ctx)
    ctx.add_origin_buf('&')
    anchor_tk = YamlTokenMakers.new_anchor(ctx.obuf, self.pos())
    ctx.add_token(anchor_tk)
    self.progress_column(ctx, 1)
    self.is_anchor = True
    ctx.clear()
    return True
1585
+
1586
def scan_alias(self, ctx: YamlScanningContext) -> bool:
    """
    Handle a '*' character as the start of an alias.

    Returns False when buffered content exists; otherwise adds an alias token, sets `is_alias` and returns True.
    """
    if ctx.exists_buffer():
        return False

    self.add_buffered_token_if_exists(ctx)
    ctx.add_origin_buf('*')
    alias_tk = YamlTokenMakers.new_alias(ctx.obuf, self.pos())
    ctx.add_token(alias_tk)
    self.progress_column(ctx, 1)
    self.is_alias = True
    ctx.clear()
    return True
1597
+
1598
def scan_reserved_char(self, ctx: YamlScanningContext, c: str) -> ta.Optional[YamlError]:
    """
    Reject a reserved character `c` (the caller passes '@' or '`') that would start a token.

    Returns None when buffered content exists (the character is then not at the start of a token); otherwise
    consumes the character and returns an InvalidTokenYamlError describing it.
    """
    if ctx.exists_buffer():
        return None

    ctx.add_buf(c)
    ctx.add_origin_buf(c)
    # The invalid token is built before the column is advanced, so self.pos() is taken at the character itself.
    bad_tk = YamlTokenMakers.new_invalid(
        yaml_error(f'{c!r} is a reserved character'),
        ctx.obuf,
        self.pos(),
    )
    failure = err_invalid_token(bad_tk)
    self.progress_column(ctx, 1)
    ctx.clear()
    return failure
1614
+
1615
def scan_tab(self, ctx: YamlScanningContext, c: str) -> ta.Optional[YamlError]:
    """
    Reject a tab character `c` that appears as the first character of a line outside flow mode.

    Returns None when a flow sequence or flow mapping is open, or when the tab is not the first character at the
    line; otherwise consumes the tab and returns an InvalidTokenYamlError.
    """
    inside_flow = self.started_flow_sequence_num > 0 or self.started_flow_map_num > 0
    if inside_flow:
        # tabs character is allowed in flow mode.
        return None

    if not self.is_first_char_at_line:
        return None

    ctx.add_buf(c)
    ctx.add_origin_buf(c)
    # The invalid token is built before the column is advanced, so self.pos() is taken at the tab itself.
    bad_tk = YamlTokenMakers.new_invalid(
        yaml_error("found character '\t' that cannot start any token"),
        ctx.obuf,
        self.pos(),
    )
    failure = err_invalid_token(bad_tk)
    self.progress_column(ctx, 1)
    ctx.clear()
    return failure
1635
+
1636
def _scan(self, ctx: YamlScanningContext) -> ta.Optional[YamlError]:
    """
    Main scanning loop: consume characters from `ctx` and dispatch each one to the matching `scan_*` handler.

    A handler that returns a truthy value has consumed the character, so the loop continues with the next one.
    When no handler claims the character it is appended to the buffers as plain content at the bottom of the
    loop.

    Returns:
        None once `ctx.next()` reports no more input (any buffered content is flushed as a final token), or the
        first YamlError produced by a handler.
    """
    while ctx.next():
        c = ctx.current_char()
        # First, change the IndentState.
        # If the target character is the first character in a line, IndentState is Up/Down/Equal state.
        # The second and subsequent letters are Keep.
        self.update_indent(ctx, c)

        # If IndentState is down, tokens are split, so the buffer accumulated until that point needs to be cut
        # into a token.
        if self.is_changed_to_indent_state_down():
            self.add_buffered_token_if_exists(ctx)

        if ctx.is_multi_line():
            if self.is_changed_to_indent_state_down():
                if (tk := ctx.last_token()) is not None:
                    # If literal/folded content is empty, no string token is added.
                    # Therefore, add an empty string token.
                    # But if literal/folded token column is 1, it is invalid at down state.
                    if tk.position.column == 1:
                        return yaml_error(err_invalid_token(
                            YamlTokenMakers.new_invalid(
                                yaml_error('could not find multi-line content'),
                                ctx.obuf,
                                self.pos(),
                            ),
                        ))

                    if tk.type != YamlTokenType.STRING:
                        ctx.add_token(YamlTokenMakers.new_string('', '', self.pos()))

                # The dedent ends the multi-line scalar; the current character is then handled by the normal
                # dispatch below.
                self.break_multi_line(ctx)

            else:
                # Still inside the multi-line scalar: scan_multi_line handles the character entirely.
                if (err := self.scan_multi_line(ctx, c)) is not None:
                    return err

                continue

        if c == '{':
            if self.scan_flow_map_start(ctx):
                continue

        elif c == '}':
            if self.scan_flow_map_end(ctx):
                continue

        elif c == '.':
            if self.scan_document_end(ctx):
                continue

        elif c == '<':
            if self.scan_merge_key(ctx):
                continue

        elif c == '-':
            # '-' is tried as a document header, then as raw-folded content, then as a sequence entry.
            if self.scan_document_start(ctx):
                continue

            if self.scan_raw_folded_char(ctx):
                continue

            scanned = self.scan_sequence(ctx)
            if isinstance(scanned, YamlError):
                return scanned

            if scanned:
                continue

        elif c == '[':
            if self.scan_flow_array_start(ctx):
                continue

        elif c == ']':
            if self.scan_flow_array_end(ctx):
                continue

        elif c == ',':
            if self.scan_flow_entry(ctx, c):
                continue

        elif c == ':':
            scanned = self.scan_map_delim(ctx)
            if isinstance(scanned, YamlError):
                return scanned

            if scanned:
                continue

        elif c in ('|', '>'):
            scanned = self.scan_multi_line_header(ctx)
            if isinstance(scanned, YamlError):
                return scanned

            if scanned:
                continue

        elif c == '!':
            scanned = self.scan_tag(ctx)
            if isinstance(scanned, YamlError):
                return scanned

            if scanned:
                continue

        elif c == '%':
            if self.scan_directive(ctx):
                continue

        elif c == '?':
            if self.scan_map_key(ctx):
                continue

        elif c == '&':
            if self.scan_anchor(ctx):
                continue

        elif c == '*':
            if self.scan_alias(ctx):
                continue

        elif c == '#':
            if self.scan_comment(ctx):
                continue

        elif c in ("'", '"'):
            scanned = self.scan_quote(ctx, c)
            if isinstance(scanned, YamlError):
                return scanned

            if scanned:
                continue

        elif c in ('\r', '\n'):
            # Line breaks are always consumed by scan_new_line.
            self.scan_new_line(ctx, c)
            continue

        elif c == ' ':
            if self.scan_white_space(ctx):
                continue

        elif c in ('@', '`'):
            # When scan_reserved_char returns None the character falls through and is buffered as plain content.
            if (err := self.scan_reserved_char(ctx, c)) is not None:
                return err

        elif c == '\t':
            if ctx.exists_buffer() and self.last_delim_column == 0:
                # tab indent for plain text (yaml-test-suite's spec-example-7-12-plain-lines).
                self.indent_num += 1
                ctx.add_origin_buf(c)
                self.progress_only(ctx, 1)
                continue

            if self.last_delim_column < self.column:
                # A tab to the right of the last delimiter column is counted as indentation and kept only in the
                # origin buffer.
                self.indent_num += 1
                ctx.add_origin_buf(c)
                self.progress_only(ctx, 1)
                continue

            if (err := self.scan_tab(ctx, c)) is not None:
                return err

        # No handler consumed the character: buffer it as plain content.
        ctx.add_buf(c)
        ctx.add_origin_buf(c)
        self.progress_column(ctx, 1)

    self.add_buffered_token_if_exists(ctx)
    return None
1804
+
1805
+ # init prepares the scanner to tokenize the given text by positioning it at the beginning of that text.
1806
def init(self, text: str) -> None:
    """Load `text` as the source to scan and rewind all position counters to the very first character."""
    self.source = text
    self.source_pos = 0
    self.source_size = len(text)
    # Line, column and offset are all 1-based.
    self.line = self.column = self.offset = 1
    self.is_first_char_at_line = True
    self.clear_state()
1816
+
1817
def clear_state(self) -> None:
    """Zero the indentation bookkeeping: previous-line indent, last delimiter column, indent level and indent."""
    self.prev_line_indent_num = self.last_delim_column = self.indent_level = self.indent_num = 0
1822
+
1823
+ # scan scans the next tokens and returns them as a token collection. The end of the source is indicated by an EofYamlError.
1824
def scan(self) -> ta.Tuple[ta.Optional[YamlTokens], ta.Optional[YamlError]]:
    """
    Scan the remaining source and return the tokens found together with an optional error.

    Returns:
        `(None, EofYamlError())` when the whole source has already been consumed; otherwise `(tokens, err)`,
        where `err` is None on success. When scanning stops on an InvalidTokenYamlError, the invalid token it
        carries is added to the end of `tokens`.
    """
    if self.source_pos >= self.source_size:
        return None, EofYamlError()

    ctx = YamlScanningContext.new(self.source[self.source_pos:])

    lst = YamlTokens()
    err = self._scan(ctx)
    lst.extend(ctx.tokens)

    # Surface the offending token in the result. Callers such as yaml_tokenize drop `err`, so without this the
    # failure would leave no trace in the token stream.
    if isinstance(err, InvalidTokenYamlError):
        lst.add(err.token)

    return lst, err
1841
+
1842
+
1843
+ # yaml_tokenize splits a source string into token instances.
1844
def yaml_tokenize(src: str) -> YamlTokens:
    """
    Tokenize `src` and return every token produced by the scanner.

    The scanner is invoked repeatedly until it reports EofYamlError. Any other error returned by a scan is not
    raised here; only the tokens of that scan are collected.
    """
    scanner = YamlScanner()
    scanner.init(src)

    collected = YamlTokens()
    while True:
        batch, err = scanner.scan()
        if isinstance(err, EofYamlError):
            return collected

        collected.add(*check.not_none(batch))
1857
+
1858
+
1859
+ ##
1860
+
1861
+
1862
def hex_to_int(s: str) -> int:
    """
    Convert a single hexadecimal digit character to its integer value.

    Raises:
        ValueError: if `s` is not exactly one character long.

    Characters outside 'A'-'F' and 'a'-'f' are measured relative to '0' without further validation.
    """
    if len(s) != 1:
        raise ValueError(s)

    for first, last in (('A', 'F'), ('a', 'f')):
        if first <= s <= last:
            return ord(s) - ord(first) + 10

    return ord(s) - ord('0')
1871
+
1872
+
1873
def hex_runes_to_int(b: str) -> int:
    """Convert a string of hexadecimal digits (most significant first) to an integer; '' yields 0."""
    total = 0
    for digit in b:
        # Shift the running value one hex digit to the left and add the next digit.
        total = (total << 4) + hex_to_int(digit)
    return total
1878
+
1879
+
1880
def trim_right_func(s: str, predicate: ta.Callable[[str], bool]) -> str:
    """Return `s` without the trailing run of characters for which `predicate` returns a true value."""
    if not s:
        return s

    end = len(s)
    # Walk back from the right edge while the character just before `end` matches.
    while end > 0 and predicate(s[end - 1]):
        end -= 1

    return s[:end]